# 参考
- https://github.com/gaussic/text-classification mxnet(gluon)/pytorch实现
- https://mxnet.incubator.apache.org/tutorials/nlp/cnn.html mxnet(sym scratch) 

# load Data

In [1]:
from __future__ import print_function

from collections import Counter
import itertools
import numpy as np
import re

try:
    # For Python 3.0 and later
    from urllib.request import urlopen
except ImportError:
    # Fall back to Python 2's urllib2
    from urllib2 import urlopen
    
def clean_str(string):
    """
    Tokenization/string cleaning.
    Original from https://github.com/yoonkim/CNN_sentence/blob/master/process_data.py

    Every character outside letters, digits, parentheses, comma, '!', '?',
    apostrophe and backtick is replaced by a space.  English clitics
    ('s, 've, n't, 're, 'd, 'll) are split off the preceding word, and
    punctuation is surrounded by spaces so that it becomes its own token.
    Parentheses and question marks are emitted with a leading backslash,
    exactly as the upstream implementation does.

    :param string: raw sentence text.
    :return: the cleaned sentence, lower-cased, with single spaces between
        tokens and no leading/trailing whitespace.
    """
    # (pattern, replacement) pairs, applied strictly in this order.
    # Replacements that contain a backslash are raw strings: a plain " \( "
    # is an invalid escape sequence and triggers a warning on Python 3.6+.
    substitutions = (
        (r"[^A-Za-z0-9(),!?\'\`]", " "),
        (r"\'s", " 's"),
        (r"\'ve", " 've"),
        (r"n\'t", " n't"),
        (r"\'re", " 're"),
        (r"\'d", " 'd"),
        (r"\'ll", " 'll"),
        (r",", " , "),
        (r"!", " ! "),
        (r"\(", r" \( "),
        (r"\)", r" \) "),
        (r"\?", r" \? "),
        # Collapse the runs of whitespace introduced by the steps above.
        (r"\s{2,}", " "),
    )
    for pattern, replacement in substitutions:
        string = re.sub(pattern, replacement, string)

    return string.strip().lower()

def download_sentences(url):
    """
    Download sentences from specified URL.

    Each line of the response is decoded as Latin-1 and stripped of
    leading and trailing whitespace (including the newline).

    :param url: URL of a text resource with one sentence per line.
    :return: list of unicode strings, one per line of the response.
    """
    # Local import keeps this function self-contained; closing() works with
    # both Python 3's urllib.request and Python 2's urllib2 responses.
    from contextlib import closing

    # Close the connection deterministically instead of leaking it.
    with closing(urlopen(url)) as remote_file:
        return [line.decode('Latin1').strip() for line in remote_file.readlines()]
    
def load_data_and_labels():
    """
    Fetch the positive and negative polarity files, tokenize every sentence
    and build the matching label vector.

    :return: tuple ``(x_text, y)`` where ``x_text`` is a list of token lists
        (positive sentences first, then negative) and ``y`` is a numpy array
        holding 1 for each positive and 0 for each negative sentence.
    """
    pos_sentences = download_sentences('https://raw.githubusercontent.com/yoonkim/CNN_sentence/master/rt-polarity.pos')
    neg_sentences = download_sentences('https://raw.githubusercontent.com/yoonkim/CNN_sentence/master/rt-polarity.neg')

    # Clean each sentence, then split on single spaces to get tokens.
    tokenized = []
    for raw_sentence in pos_sentences + neg_sentences:
        tokenized.append(clean_str(raw_sentence).split(" "))

    # Labels follow the same ordering as the sentences: positives, negatives.
    y = np.concatenate([[1] * len(pos_sentences), [0] * len(neg_sentences)], 0)
    return tokenized, y


def pad_sentences(sentences, padding_word=""):
    """
    Pads all sentences to be the length of the longest sentence.

    :param sentences: sequence of token lists.
    :param padding_word: token appended to short sentences.
    :return: a new list of new token lists, all of equal length; the input
        lists are not modified.  An empty input yields an empty list.
    """
    # max() raises ValueError on an empty sequence, so handle it up front.
    if len(sentences) == 0:
        return []

    sequence_length = max(len(sentence) for sentence in sentences)
    return [
        sentence + [padding_word] * (sequence_length - len(sentence))
        for sentence in sentences
    ]


def build_vocab(sentences):
    """
    Create a frequency-ordered vocabulary from tokenized sentences.

    :param sentences: iterable of token lists.
    :return: tuple ``(vocabulary, vocabulary_inv)``; ``vocabulary_inv`` is the
        list of distinct tokens from most to least frequent and
        ``vocabulary`` maps each token to its position in that list.
    """
    # Count every token across all sentences.
    token_counts = Counter(itertools.chain.from_iterable(sentences))

    # Index -> token, most frequent token first.
    index_to_token = [token for token, _ in token_counts.most_common()]

    # Token -> index.
    token_to_index = {}
    for position, token in enumerate(index_to_token):
        token_to_index[token] = position

    return token_to_index, index_to_token


def build_input_data(sentences, labels, vocabulary):
    """
    Convert tokenized sentences into index arrays using a vocabulary.

    :param sentences: iterable of token lists.
    :param labels: sequence of labels, converted as-is to a numpy array.
    :param vocabulary: mapping from token to integer index; every token in
        ``sentences`` must be a key (a missing token raises ``KeyError``).
    :return: tuple ``(x, y)`` of numpy arrays.
    """
    index_rows = []
    for sentence in sentences:
        index_rows.append([vocabulary[token] for token in sentence])

    return np.array(index_rows), np.array(labels)

"""
Loads and preprocesses data for the MR dataset.
Returns input vectors, labels, vocabulary, and inverse vocabulary.
"""
# Load and preprocess data: download + tokenize, pad every sentence to the
# longest one, build the vocabulary, then map tokens to integer indices.
sentences, labels = load_data_and_labels()
sentences_padded = pad_sentences(sentences)
# The vocabulary is built from the *padded* sentences, so the padding token
# ("") is itself a vocabulary entry.
vocabulary, vocabulary_inv = build_vocab(sentences_padded)
x, y = build_input_data(sentences_padded, labels, vocabulary)

vocab_size = len(vocabulary)

# Randomly shuffle data; the fixed seed makes the permutation (and therefore
# the train/test split below) reproducible.
np.random.seed(10)
shuffle_indices = np.random.permutation(np.arange(len(y)))
x_shuffled = x[shuffle_indices]
y_shuffled = y[shuffle_indices]

# Split train/test set: the last 1000 shuffled examples are held out,
# everything before them is used for training.
# There are a total of 10662 labeled examples in the dataset.
x_train, x_test = x_shuffled[:-1000], x_shuffled[-1000:]
y_train, y_test = y_shuffled[:-1000], y_shuffled[-1000:]

# Number of tokens per (padded) sentence.
sentence_size = x_train.shape[1]

print('Train/Test split: %d/%d' % (len(y_train), len(y_test)))
print('train shape:', x_train.shape)
print('test shape:', x_test.shape)
print('vocab_size', vocab_size)
print('sentence max words', sentence_size)

Train/Test split: 9662/1000
train shape: (9662, 56)
test shape: (1000, 56)
vocab_size 18766
sentence max words 56


# gen train test set 

In [2]:
from mxnet import gluon
# Mini-batch size used by both loaders.
batch_size = 50
# Wrap the numpy arrays as Gluon datasets; token indices and labels are both
# cast to float32 before being handed to MXNet.
dataset_train = gluon.data.ArrayDataset(x_train.astype('float32'), y_train.astype('float32'))
train_data = gluon.data.DataLoader(dataset_train, batch_size, shuffle=True)
dataset_test = gluon.data.ArrayDataset(x_test.astype('float32'), y_test.astype('float32'))
# NOTE(review): the test loader is shuffled as well; this only changes the
# order in which test batches are produced.
test_data = gluon.data.DataLoader(dataset_test, batch_size, shuffle=True)
# Sanity check: print the first test batch only.
for data, label in test_data:
    print(data, label)
    break

  Optimizer.opt_registry[name].__name__))



[[  1.00000000e+00   1.80000000e+01   7.00000000e+00 ...,   0.00000000e+00
    0.00000000e+00   0.00000000e+00]
 [  2.50000000e+01   4.95000000e+02   1.00000000e+00 ...,   0.00000000e+00
    0.00000000e+00   0.00000000e+00]
 [  9.80000000e+01   8.41000000e+02   3.00000000e+00 ...,   0.00000000e+00
    0.00000000e+00   0.00000000e+00]
 ..., 
 [  1.70000000e+01   2.54800000e+03   5.29000000e+02 ...,   0.00000000e+00
    0.00000000e+00   0.00000000e+00]
 [  6.00000000e+00   2.66000000e+02   1.10000000e+01 ...,   0.00000000e+00
    0.00000000e+00   0.00000000e+00]
 [  8.42200000e+03   7.00000000e+00   2.00000000e+01 ...,   0.00000000e+00
    0.00000000e+00   0.00000000e+00]]
<NDArray 50x56 @cpu(0)> 
[ 1.  0.  1.  1.  1.  1.  0.  1.  0.  0.  1.  1.  0.  1.  0.  1.  0.  0.
  1.  1.  0.  0.  1.  0.  1.  0.  0.  1.  1.  0.  0.  0.  1.  1.  1.  1.
  1.  0.  0.  1.  1.  1.  1.  0.  1.  1.  1.  1.  0.  1.]
<NDArray 50 @cpu(0)>


In [40]:
from mxnet.gluon import Block, nn
from mxnet import ndarray as nd

class Conv_Max_Pooling(Block):
    """A ReLU-activated 2-D convolution followed by global max pooling.

    :param filter_size: kernel size passed to ``nn.Conv2D``.
    :param num_filter: number of output channels of the convolution.
    :param kwargs: forwarded to the ``Block`` constructor.
    """

    def __init__(self, filter_size, num_filter, **kwargs):
        super(Conv_Max_Pooling, self).__init__(**kwargs)

        self.filter_size, self.num_filter = filter_size, num_filter
        # Child layers are created inside the name scope of this block.
        with self.name_scope():
            self.conv = nn.Conv2D(num_filter, filter_size, activation='relu')
            self.max_pooling = nn.GlobalMaxPool2D()

    def forward(self, x):
        """Apply the convolution, then the global max pooling, to ``x``."""
        return self.max_pooling(self.conv(x))
    
class TextCNN(Block):
    """Text classifier: embedding -> parallel conv/max-pool branches -> dropout -> dense.

    One ``Conv_Max_Pooling`` branch is created per entry of ``filter_list``;
    each branch uses a kernel of shape ``(filter_height, num_embed)``, i.e.
    it spans the full embedding width.  Branches are stored as attributes
    ``conv_max_pooling0``, ``conv_max_pooling1``, ...

    :param vocab_size: number of rows of the embedding table.
    :param num_embed: embedding dimension.
    :param sentence_size: number of tokens per input sentence.
    :param num_label: number of output units of the final dense layer.
    :param num_filter: number of filters per convolution branch.
    :param filter_list: kernel heights, one per branch; defaults to
        ``[3, 4, 5]`` when omitted.
    :param dropout_rate: rate passed to ``nn.Dropout``.
    :param kwargs: forwarded to the ``Block`` constructor.
    :raises ValueError: if ``filter_list`` is empty.
    """

    def __init__(self, vocab_size, num_embed, sentence_size, num_label, num_filter, filter_list=None, dropout_rate=0.5, **kwargs):
        # Initialise the Block machinery before any attribute is assigned.
        super(TextCNN, self).__init__(**kwargs)

        # A None sentinel avoids sharing one mutable default list between
        # all instances.
        if filter_list is None:
            filter_list = [3, 4, 5]
        if len(filter_list) == 0:
            raise ValueError('filter_list must contain at least one filter size')

        self.vocab_size = vocab_size
        self.num_embed = num_embed
        self.sentence_size = sentence_size
        self.num_filter = num_filter
        self.filter_list = filter_list
        self.num_label = num_label
        self.dropout_rate = dropout_rate

        with self.name_scope():
            self.embed = nn.Embedding(self.vocab_size, self.num_embed)
            print(self.filter_list)
            # One branch per filter height.  setattr() goes through
            # Block.__setattr__, the same path a plain attribute assignment
            # takes, so each branch is registered as a child block.
            for i, filter_height in enumerate(self.filter_list):
                setattr(self, 'conv_max_pooling%d' % i,
                        Conv_Max_Pooling((filter_height, self.num_embed), self.num_filter))
            self.dropout = nn.Dropout(self.dropout_rate)
            self.dense = nn.Dense(self.num_label)

    def forward(self, x):
        """Run the network on a batch of token-index sentences ``x``."""
        x = self.embed(x)
        # Add a singleton channel axis: (batch, 1, sentence_size, num_embed).
        x = x.reshape(shape=(x.shape[0], 1, self.sentence_size, self.num_embed)) # batch-num_filter-y-x
        pooled = [getattr(self, 'conv_max_pooling%d' % i)(x)
                  for i in range(len(self.filter_list))]
        # Join the branch outputs (Concat's default axis, as before).
        x = nd.Concat(*pooled)
        x = self.dropout(x)
        x = self.dense(x)

        return x

import utilstmp
# Train the Gluon TextCNN with Adam.
optimizer = 'adam'
# NOTE(review): utilstmp is a project-local helper module not shown here;
# try_gpu presumably returns gpu(1) when available -- confirm in utilstmp.
ctx = utilstmp.try_gpu(device_id=1)
softmax_cross_entropy = gluon.loss.SoftmaxCrossEntropyLoss()
# Positional arguments: num_embed=300, num_label=2, num_filter=100,
# filter_list=[3, 4, 5].
net = TextCNN(vocab_size, 300, sentence_size, 2, 100, [3, 4, 5])
net.collect_params().initialize(ctx=ctx)

# learning rate 1e-3, no weight decay.
trainer = gluon.Trainer(net.collect_params(), optimizer, {'learning_rate': 1e-3, 'wd': 0})
utilstmp.train(train_data, test_data, net, softmax_cross_entropy, trainer, ctx, num_epochs=10)

[3, 4, 5]
Start training on  gpu(1)
Epoch 0. Loss: 0.630, Train acc 0.63, Test acc 0.74, Time 3.1 sec
Epoch 1. Loss: 0.358, Train acc 0.85, Test acc 0.77, Time 2.8 sec
Epoch 2. Loss: 0.138, Train acc 0.95, Test acc 0.77, Time 2.8 sec
Epoch 3. Loss: 0.047, Train acc 0.99, Test acc 0.76, Time 2.8 sec
Epoch 4. Loss: 0.018, Train acc 1.00, Test acc 0.75, Time 2.8 sec
Epoch 5. Loss: 0.007, Train acc 1.00, Test acc 0.75, Time 2.8 sec
Epoch 6. Loss: 0.004, Train acc 1.00, Test acc 0.75, Time 2.8 sec
Epoch 7. Loss: 0.002, Train acc 1.00, Test acc 0.75, Time 2.8 sec
Epoch 8. Loss: 0.001, Train acc 1.00, Test acc 0.75, Time 2.8 sec
Epoch 9. Loss: 0.001, Train acc 1.00, Test acc 0.75, Time 2.8 sec


In [30]:
tmp = net.collect_params()

In [31]:
tmp

textcnn4_ (
  Parameter textcnn4_embedding0_weight (shape=(18766, 300), dtype=<class 'numpy.float32'>)
  Parameter textcnn4_conv_max_pooling0_conv0_weight (shape=(100, 0, 3, 300), dtype=<class 'numpy.float32'>)
  Parameter textcnn4_conv_max_pooling0_conv0_bias (shape=(100,), dtype=<class 'numpy.float32'>)
  Parameter textcnn4_conv_max_pooling1_conv0_weight (shape=(100, 0, 4, 300), dtype=<class 'numpy.float32'>)
  Parameter textcnn4_conv_max_pooling1_conv0_bias (shape=(100,), dtype=<class 'numpy.float32'>)
  Parameter textcnn4_conv_max_pooling2_conv0_weight (shape=(100, 0, 5, 300), dtype=<class 'numpy.float32'>)
  Parameter textcnn4_conv_max_pooling2_conv0_bias (shape=(100,), dtype=<class 'numpy.float32'>)
  Parameter textcnn4_dense0_weight (shape=(2, 0), dtype=<class 'numpy.float32'>)
  Parameter textcnn4_dense0_bias (shape=(2,), dtype=<class 'numpy.float32'>)
)

In [38]:
net.embed(data)


[[[ 0.01051739  0.01053851 -0.04507791 ..., -0.02943108 -0.01583498
   -0.05689997]
  [-0.01922515 -0.04759626  0.0615257  ..., -0.03813783 -0.01735882
    0.04050624]
  [-0.06869376 -0.02069202 -0.03471726 ..., -0.064337    0.02569561
    0.03292548]
  ..., 
  [-0.00870891  0.05063449 -0.01444717 ..., -0.05882626 -0.06048116
   -0.03014581]
  [-0.00870891  0.05063449 -0.01444717 ..., -0.05882626 -0.06048116
   -0.03014581]
  [-0.00870891  0.05063449 -0.01444717 ..., -0.05882626 -0.06048116
   -0.03014581]]

 [[-0.01288972  0.00666983 -0.00080863 ...,  0.06368881  0.0672717
    0.00757457]
  [-0.04548897 -0.04362133 -0.04405545 ...,  0.03885452  0.06952807
    0.00391911]
  [ 0.04797121 -0.0221878   0.03784377 ..., -0.0370189   0.06602668
   -0.0055925 ]
  ..., 
  [-0.00870891  0.05063449 -0.01444717 ..., -0.05882626 -0.06048116
   -0.03014581]
  [-0.00870891  0.05063449 -0.01444717 ..., -0.05882626 -0.06048116
   -0.03014581]
  [-0.00870891  0.05063449 -0.01444717 ..., -0.05882626 -0

In [17]:
# Print the current value of every parameter in the collection `tmp`.
# items() already yields the parameter object, so use it directly instead
# of looking it up again by key.
for k, v in tmp.items():
    print(v.data())


[[-0.01844883 -0.00608966 -0.01713854 ...,  0.01168765 -0.03602036
   0.04260388]
 [ 0.00130237 -0.01963045 -0.00125761 ...,  0.01075296  0.07433405
   0.01905092]
 [-0.04616839 -0.03550442  0.02774295 ..., -0.04164637 -0.04043136
   0.03241483]
 ..., 
 [-0.04427681  0.03144157 -0.08177496 ...,  0.02111061  0.05858256
  -0.03968523]
 [-0.02061614 -0.00990532 -0.02740752 ..., -0.00576682 -0.02916981
   0.06066374]
 [-0.04974853  0.04050153 -0.04721426 ...,  0.06376306 -0.00339565
   0.06702992]]
<NDArray 18766x128 @gpu(1)>

[[[[ 0.23011425  0.21755975  0.2653915 ]
   [ 0.2183774  -0.03421235  0.23805647]
   [ 0.17041427 -0.13803923  0.22440146]]]


 [[[ 0.00585393  0.02951887 -0.01663677]
   [-0.01449551  0.01731502 -0.00521706]
   [-0.03048176  0.05311435  0.02979257]]]


 [[[ 0.07188193  0.11976178 -0.13060977]
   [-0.01548676 -0.14089131  0.11655298]
   [-0.06868246  0.07114433 -0.13202971]]]


 [[[-0.17845884  0.1449344   0.05649428]
   [ 0.1376044  -0.11901021 -0.0703479 ]
   [-0.

In [37]:
import mxnet
# Grab the first batch produced by the training loader for interactive
# inspection; the 0 placeholders are overwritten on the first iteration.
data, label = 0, 0
for d, l in train_data:
    data, label = d, l
    break
# data = data.copyto(ctx)
# label = label.copyto(ctx)
# The bare expression below has no effect in the middle of a cell.
data, label
# Copy the batch to GPU device 1 (hard-coded device id).
data = data.copyto(mxnet.gpu(1))

In [5]:
from mxnet import gluon
from mxnet.gluon import nn

import sys
sys.path.append('.')
from mxnet import ndarray as nd
from mxnet import autograd
import utilstmp
import time

def net_structure(num_hidden, num_outputs, ctx, num_embed):
    """Build and initialise a small sequential embedding + Conv1D classifier.

    Layers, in order: Embedding(vocab_size, num_embed), Conv1D with one
    channel and kernel size 3, global max pooling, Dropout(0.5) and a Dense
    layer with ``num_outputs`` units.  The network and its parameters are
    printed before initialisation.

    :param num_hidden: unused; kept so existing callers keep working.
    :param num_outputs: number of units of the final dense layer.
    :param ctx: context on which the parameters are initialised.
    :param num_embed: dimensions to embed words into.
    :return: the initialised ``gluon.nn.Sequential`` network.

    Reads the module-level ``vocab_size``.
    """
    filter_size = 3  # kernel size of the 1-D convolution

    net = gluon.nn.Sequential()
    with net.name_scope():
        # NOTE(review): Embedding output is fed to Conv1D without any
        # transpose; confirm the axis order matches Conv1D's expected layout.
        net.add(
            nn.Embedding(vocab_size, num_embed),
            nn.Conv1D(1, filter_size),
            nn.GlobalMaxPool1D(),
            nn.Dropout(0.5),
            nn.Dense(num_outputs)
        )
        print(net)
        print(net.collect_params())
    net.initialize(ctx=ctx)
    return net

def mlp(optimizer='sgd', num_outputs=2, num_hidden=256, weight_scale=.01, learning_rate=0.0005,
        num_epoch=10, batch_size=50, num_embed=300):
    """Build the network from ``net_structure`` and train it.

    Training uses the module-level ``train_data`` / ``test_data`` loaders and
    a softmax cross-entropy loss.

    :param optimizer: optimizer name passed to ``gluon.Trainer``.
    :param num_outputs: number of output classes.
    :param num_hidden: forwarded to ``net_structure``.
    :param weight_scale: unused; kept for backward compatibility.
    :param learning_rate: learning rate passed to the trainer.
    :param num_epoch: number of training epochs.
    :param batch_size: unused here (the loaders are built elsewhere); kept
        for backward compatibility.
    :param num_embed: embedding dimension forwarded to ``net_structure``.
    """
    # NOTE(review): utilstmp is a project-local helper module not shown here.
    ctx = utilstmp.try_gpu(device_id=1)
    net = net_structure(num_hidden=num_hidden, num_outputs=num_outputs, ctx=ctx, num_embed=num_embed)
    loss = gluon.loss.SoftmaxCrossEntropyLoss()

    trainer = gluon.Trainer(net.collect_params(), optimizer, {'learning_rate': learning_rate})
    utilstmp.train(train_data, test_data, net, loss, trainer, ctx, num_epochs=num_epoch)

In [7]:
from __future__ import print_function

from collections import Counter
import itertools
import numpy as np
import re

try:
    # For Python 3.0 and later
    from urllib.request import urlopen
except ImportError:
    # Fall back to Python 2's urllib2
    from urllib2 import urlopen
    
def clean_str(string):
    """
    Tokenization/string cleaning.
    Original from https://github.com/yoonkim/CNN_sentence/blob/master/process_data.py

    Every character outside letters, digits, parentheses, comma, '!', '?',
    apostrophe and backtick is replaced by a space.  English clitics
    ('s, 've, n't, 're, 'd, 'll) are split off the preceding word, and
    punctuation is surrounded by spaces so that it becomes its own token.
    Parentheses and question marks are emitted with a leading backslash,
    exactly as the upstream implementation does.

    :param string: raw sentence text.
    :return: the cleaned sentence, lower-cased, with single spaces between
        tokens and no leading/trailing whitespace.
    """
    # (pattern, replacement) pairs, applied strictly in this order.
    # Replacements that contain a backslash are raw strings: a plain " \( "
    # is an invalid escape sequence and triggers a warning on Python 3.6+.
    substitutions = (
        (r"[^A-Za-z0-9(),!?\'\`]", " "),
        (r"\'s", " 's"),
        (r"\'ve", " 've"),
        (r"n\'t", " n't"),
        (r"\'re", " 're"),
        (r"\'d", " 'd"),
        (r"\'ll", " 'll"),
        (r",", " , "),
        (r"!", " ! "),
        (r"\(", r" \( "),
        (r"\)", r" \) "),
        (r"\?", r" \? "),
        # Collapse the runs of whitespace introduced by the steps above.
        (r"\s{2,}", " "),
    )
    for pattern, replacement in substitutions:
        string = re.sub(pattern, replacement, string)

    return string.strip().lower()

def download_sentences(url):
    """
    Download sentences from specified URL.

    Each line of the response is decoded as Latin-1 and stripped of
    leading and trailing whitespace (including the newline).

    :param url: URL of a text resource with one sentence per line.
    :return: list of unicode strings, one per line of the response.
    """
    # Local import keeps this function self-contained; closing() works with
    # both Python 3's urllib.request and Python 2's urllib2 responses.
    from contextlib import closing

    # Close the connection deterministically instead of leaking it.
    with closing(urlopen(url)) as remote_file:
        return [line.decode('Latin1').strip() for line in remote_file.readlines()]
    
def load_data_and_labels():
    """
    Fetch the positive and negative polarity files, tokenize every sentence
    and build the matching label vector.

    :return: tuple ``(x_text, y)`` where ``x_text`` is a list of token lists
        (positive sentences first, then negative) and ``y`` is a numpy array
        holding 1 for each positive and 0 for each negative sentence.
    """
    pos_sentences = download_sentences('https://raw.githubusercontent.com/yoonkim/CNN_sentence/master/rt-polarity.pos')
    neg_sentences = download_sentences('https://raw.githubusercontent.com/yoonkim/CNN_sentence/master/rt-polarity.neg')

    # Clean each sentence, then split on single spaces to get tokens.
    tokenized = []
    for raw_sentence in pos_sentences + neg_sentences:
        tokenized.append(clean_str(raw_sentence).split(" "))

    # Labels follow the same ordering as the sentences: positives, negatives.
    y = np.concatenate([[1] * len(pos_sentences), [0] * len(neg_sentences)], 0)
    return tokenized, y


def pad_sentences(sentences, padding_word=""):
    """
    Pads all sentences to be the length of the longest sentence.

    :param sentences: sequence of token lists.
    :param padding_word: token appended to short sentences.
    :return: a new list of new token lists, all of equal length; the input
        lists are not modified.  An empty input yields an empty list.
    """
    # max() raises ValueError on an empty sequence, so handle it up front.
    if len(sentences) == 0:
        return []

    sequence_length = max(len(sentence) for sentence in sentences)
    return [
        sentence + [padding_word] * (sequence_length - len(sentence))
        for sentence in sentences
    ]


def build_vocab(sentences):
    """
    Create a frequency-ordered vocabulary from tokenized sentences.

    :param sentences: iterable of token lists.
    :return: tuple ``(vocabulary, vocabulary_inv)``; ``vocabulary_inv`` is the
        list of distinct tokens from most to least frequent and
        ``vocabulary`` maps each token to its position in that list.
    """
    # Count every token across all sentences.
    token_counts = Counter(itertools.chain.from_iterable(sentences))

    # Index -> token, most frequent token first.
    index_to_token = [token for token, _ in token_counts.most_common()]

    # Token -> index.
    token_to_index = {}
    for position, token in enumerate(index_to_token):
        token_to_index[token] = position

    return token_to_index, index_to_token


def build_input_data(sentences, labels, vocabulary):
    """
    Convert tokenized sentences into index arrays using a vocabulary.

    :param sentences: iterable of token lists.
    :param labels: sequence of labels, converted as-is to a numpy array.
    :param vocabulary: mapping from token to integer index; every token in
        ``sentences`` must be a key (a missing token raises ``KeyError``).
    :return: tuple ``(x, y)`` of numpy arrays.
    """
    index_rows = []
    for sentence in sentences:
        index_rows.append([vocabulary[token] for token in sentence])

    return np.array(index_rows), np.array(labels)

"""
Loads and preprocesses data for the MR dataset.
Returns input vectors, labels, vocabulary, and inverse vocabulary.
"""
# Load and preprocess data: download + tokenize, pad every sentence to the
# longest one, build the vocabulary, then map tokens to integer indices.
sentences, labels = load_data_and_labels()
sentences_padded = pad_sentences(sentences)
# The vocabulary is built from the *padded* sentences, so the padding token
# ("") is itself a vocabulary entry.
vocabulary, vocabulary_inv = build_vocab(sentences_padded)
x, y = build_input_data(sentences_padded, labels, vocabulary)

vocab_size = len(vocabulary)

# Randomly shuffle data; the fixed seed makes the permutation (and therefore
# the train/dev split below) reproducible.
np.random.seed(10)
shuffle_indices = np.random.permutation(np.arange(len(y)))
x_shuffled = x[shuffle_indices]
y_shuffled = y[shuffle_indices]

# Split train/dev set: the last 1000 shuffled examples are held out,
# everything before them is used for training.
# There are a total of 10662 labeled examples in the dataset.
x_train, x_dev = x_shuffled[:-1000], x_shuffled[-1000:]
y_train, y_dev = y_shuffled[:-1000], y_shuffled[-1000:]

# Number of tokens per (padded) sentence.
sentence_size = x_train.shape[1]

print('Train/Dev split: %d/%d' % (len(y_train), len(y_dev)))
print('train shape:', x_train.shape)
print('dev shape:', x_dev.shape)
print('vocab_size', vocab_size)
print('sentence max words', sentence_size)

import mxnet as mx
import sys,os

'''
Define batch size and the place holders for network inputs and outputs
'''

# Build the text CNN with MXNet's symbolic API:
# embedding -> one conv/relu/max-pool branch per filter size -> concat
# -> dropout -> fully connected -> softmax.
# batch_size is baked into the Reshape calls below, so the resulting graph
# only accepts batches of exactly this size.
batch_size = 50
print('batch size', batch_size)

input_x = mx.sym.Variable('data') # placeholder for input data
input_y = mx.sym.Variable('softmax_label') # placeholder for output label


'''
Define the first network layer (embedding)
'''

# create embedding layer to learn representation of words in a lower dimensional subspace (much like word2vec)
num_embed = 300 # dimensions to embed words into
print('embedding dimensions', num_embed)

embed_layer = mx.sym.Embedding(data=input_x, input_dim=vocab_size, output_dim=num_embed, name='vocab_embed')

# reshape embedded data for next layer: add a singleton channel axis so each
# sentence becomes a (sentence_size x num_embed) single-channel "image"
conv_input = mx.sym.Reshape(data=embed_layer, target_shape=(batch_size, 1, sentence_size, num_embed))

# create convolution + (max) pooling layer for each filter operation
filter_list=[3, 4, 5] # the size of filters to use
print('convolution filters', filter_list)

num_filter=100
pooled_outputs = []
for filter_size in filter_list:
    # The kernel spans filter_size words and the full embedding width.
    convi = mx.sym.Convolution(data=conv_input, kernel=(filter_size, num_embed), num_filter=num_filter)
    relui = mx.sym.Activation(data=convi, act_type='relu')
    # Pooling window height is sentence_size - filter_size + 1, i.e. the
    # number of positions the kernel can take along the sentence (assuming
    # the convolution's default of no padding -- TODO confirm).
    pooli = mx.sym.Pooling(data=relui, pool_type='max', kernel=(sentence_size - filter_size + 1, 1), stride=(1, 1))
    pooled_outputs.append(pooli)

# combine all pooled outputs along the channel axis
total_filters = num_filter * len(filter_list)
concat = mx.sym.Concat(*pooled_outputs, dim=1)

# reshape for next layer: flatten to (batch_size, total_filters)
h_pool = mx.sym.Reshape(data=concat, target_shape=(batch_size, total_filters))

# dropout layer
dropout = 0.5
print('dropout probability', dropout)

# Skip the dropout op entirely when the probability is zero.
if dropout > 0.0:
    h_drop = mx.sym.Dropout(data=h_pool, p=dropout)
else:
    h_drop = h_pool
    
# fully connected layer
num_label = 2

# Explicitly named weight/bias variables for the classifier layer.
cls_weight = mx.sym.Variable('cls_weight')
cls_bias = mx.sym.Variable('cls_bias')

fc = mx.sym.FullyConnected(data=h_drop, weight=cls_weight, bias=cls_bias, num_hidden=num_label)

# softmax output
sm = mx.sym.SoftmaxOutput(data=fc, label=input_y, name='softmax')

# set CNN pointer to the "back" of the network
cnn = sm

from collections import namedtuple
import math
import time

# Define the structure of our CNN Model (as a named tuple) bundling the
# bound executor, its symbol, the input/label arrays and the parameter list.
CNNModel = namedtuple("CNNModel", ['cnn_exec', 'symbol', 'data', 'label', 'param_blocks'])

# Define what device to train/test on
ctx = mx.gpu(0)
# If you have no GPU on your machine change this to
# ctx = mx.cpu(0)

arg_names = cnn.list_arguments()

# Only the data shape is supplied; the remaining argument shapes are
# inferred from it.
input_shapes = {}
input_shapes['data'] = (batch_size, sentence_size)

arg_shape, out_shape, aux_shape = cnn.infer_shape(**input_shapes)
# Allocate zero-filled storage for every argument (inputs and parameters).
arg_arrays = [mx.nd.zeros(s, ctx) for s in arg_shape]
# Allocate gradient buffers for the parameters only.
args_grad = {}
for shape, name in zip(arg_shape, arg_names):
    if name in ['softmax_label', 'data']: # input, output
        continue
    args_grad[name] = mx.nd.zeros(shape, ctx)

# grad_req='add' is requested here; the training loop resets each gradient
# buffer to zero after it has been applied.
cnn_exec = cnn.bind(ctx=ctx, args=arg_arrays, args_grad=args_grad, grad_req='add')

# Initialise every parameter uniformly (scale 0.1) and collect
# (argument index, weight array, gradient array, name) tuples.
param_blocks = []
arg_dict = dict(zip(arg_names, cnn_exec.arg_arrays))
initializer = mx.initializer.Uniform(0.1)
for i, name in enumerate(arg_names):
    if name in ['softmax_label', 'data']: # input, output
        continue
    initializer(mx.init.InitDesc(name), arg_dict[name])

    param_blocks.append( (i, arg_dict[name], args_grad[name], name) )

# Handles to the executor's input and label arrays; batches are written into
# these in place.
data = cnn_exec.arg_dict['data']
label = cnn_exec.arg_dict['softmax_label']

cnn_model= CNNModel(cnn_exec=cnn_exec, symbol=cnn, data=data, label=label, param_blocks=param_blocks)

'''
Train the cnn_model using back prop
'''

# Manual training loop for the bound executor: per epoch, train on every
# full batch with global gradient-norm clipping, optionally checkpoint, then
# evaluate on the dev set.
optimizer = 'rmsprop'
max_grad_norm = 5.0
learning_rate = 0.0005
epoch = 10

print('optimizer', optimizer)
print('maximum gradient', max_grad_norm)
print('learning rate (step size)', learning_rate)
print('epochs to train for', epoch)

# create optimizer
opt = mx.optimizer.create(optimizer)
opt.lr = learning_rate

updater = mx.optimizer.get_updater(opt)

# For each training epoch
for iteration in range(epoch):
    tic = time.time()
    num_correct = 0
    num_total = 0

    # Over each batch of training data
    for begin in range(0, x_train.shape[0], batch_size):
        batchX = x_train[begin:begin+batch_size]
        batchY = y_train[begin:begin+batch_size]
        # The executor was bound with a fixed batch size, so a trailing
        # partial batch cannot be fed and is skipped.
        if batchX.shape[0] != batch_size:
            continue

        cnn_model.data[:] = batchX
        cnn_model.label[:] = batchY

        # forward
        cnn_model.cnn_exec.forward(is_train=True)

        # backward
        cnn_model.cnn_exec.backward()

        # eval on training data
        predictions = np.argmax(cnn_model.cnn_exec.outputs[0].asnumpy(), axis=1)
        num_correct += np.sum(batchY == predictions)
        num_total += len(batchY)

        # update weights
        # First pass: average each gradient over the batch (in place) and
        # accumulate the squared L2 norms to get the global gradient norm.
        norm = 0
        for idx, weight, grad, name in cnn_model.param_blocks:
            grad /= batch_size
            l2_norm = mx.nd.norm(grad).asscalar()
            norm += l2_norm * l2_norm

        norm = math.sqrt(norm)
        # Second pass: rescale all gradients if the global norm exceeds the
        # threshold, then apply the update.
        for idx, weight, grad, name in cnn_model.param_blocks:
            if norm > max_grad_norm:
                grad *= (max_grad_norm / norm)

            updater(idx, grad, weight)

            # reset gradient to zero (the executor accumulates gradients)
            grad[:] = 0.0

    # Decay learning rate for this epoch to ensure we are not "overshooting" optima
    # (halved every 50 epochs; never triggers while epoch <= 50)
    if iteration % 50 == 0 and iteration > 0:
        opt.lr *= 0.5
        print('reset learning rate to %g' % opt.lr)

    # End of training loop for this epoch
    toc = time.time()
    train_time = toc - tic
    # Guard against an empty epoch (fewer training examples than one batch).
    train_acc = num_correct * 100 / float(num_total) if num_total else 0.0

    # Saving checkpoint to disk
    if (iteration + 1) % 10 == 0:
        prefix = 'cnn'
        cnn_model.symbol.save('./%s-symbol.json' % prefix)
        save_dict = {('arg:%s' % k) : v  for k, v in cnn_model.cnn_exec.arg_dict.items()}
        save_dict.update({('aux:%s' % k) : v for k, v in cnn_model.cnn_exec.aux_dict.items()})
        param_name = './%s-%04d.params' % (prefix, iteration)
        mx.nd.save(param_name, save_dict)
        print('Saved checkpoint to %s' % param_name)


    # Evaluate model after this epoch on dev (test) set
    num_correct = 0
    num_total = 0

    # For each test batch
    for begin in range(0, x_dev.shape[0], batch_size):
        batchX = x_dev[begin:begin+batch_size]
        batchY = y_dev[begin:begin+batch_size]

        # Same fixed-batch-size restriction as during training.
        if batchX.shape[0] != batch_size:
            continue

        cnn_model.data[:] = batchX
        cnn_model.cnn_exec.forward(is_train=False)

        predictions = np.argmax(cnn_model.cnn_exec.outputs[0].asnumpy(), axis=1)
        num_correct += np.sum(batchY == predictions)
        num_total += len(batchY)

    dev_acc = num_correct * 100 / float(num_total) if num_total else 0.0
    # Adjacent literals are used instead of a backslash continuation inside
    # the string, which embedded the next line's indentation in the message.
    print('Iter [%d] Train: Time: %.3fs, Training Accuracy: %.3f'
          ' --- Dev Accuracy thus far: %.3f' % (iteration, train_time, train_acc, dev_acc))



















  string = re.sub(r"\(", " \( ", string)
  string = re.sub(r"\)", " \) ", string)
  string = re.sub(r"\?", " \? ", string)


Train/Dev split: 9662/1000
train shape: (9662, 56)
dev shape: (1000, 56)
vocab_size 18766
sentence max words 56
batch size 50
embedding dimensions 300
convolution filters [3, 4, 5]
dropout probability 0.5
optimizer rmsprop
maximum gradient 5.0
learning rate (step size) 0.0005
epochs to train for 10
Iter [0] Train: Time: 3.059s, Training Accuracy: 56.705             --- Dev Accuracy thus far: 62.500
Iter [1] Train: Time: 2.907s, Training Accuracy: 71.461             --- Dev Accuracy thus far: 69.800
Iter [2] Train: Time: 2.862s, Training Accuracy: 81.876             --- Dev Accuracy thus far: 74.700
Iter [3] Train: Time: 2.857s, Training Accuracy: 87.762             --- Dev Accuracy thus far: 76.200
Iter [4] Train: Time: 2.861s, Training Accuracy: 91.720             --- Dev Accuracy thus far: 77.200
Iter [5] Train: Time: 2.863s, Training Accuracy: 94.363             --- Dev Accuracy thus far: 77.200
Iter [6] Train: Time: 2.866s, Training Accuracy: 96.114             --- Dev Accuracy thu

In [24]:
# Print the shape of every array stored in the parameter tuples; the integer
# index and string name in each tuple are skipped.
for l in cnn_model.param_blocks:
    for v in l:
        # isinstance is the idiomatic type check (instead of type(v) != ...).
        if not isinstance(v, (int, str)):
            print(v.shape)

(18766, 300)
(18766, 300)
(100, 1, 3, 300)
(100, 1, 3, 300)
(100,)
(100,)
(100, 1, 4, 300)
(100, 1, 4, 300)
(100,)
(100,)
(100, 1, 5, 300)
(100, 1, 5, 300)
(100,)
(100,)
(2, 300)
(2, 300)
(2,)
(2,)


In [25]:
tmp

textcnn1_ (
  Parameter textcnn1_embedding0_weight (shape=(18766, 128), dtype=<class 'numpy.float32'>)
  Parameter textcnn1_conv_max_pooling0_conv0_weight (shape=(100, 1, 3, 3), dtype=<class 'numpy.float32'>)
  Parameter textcnn1_conv_max_pooling0_conv0_bias (shape=(100,), dtype=<class 'numpy.float32'>)
  Parameter textcnn1_conv_max_pooling1_conv0_weight (shape=(100, 1, 4, 4), dtype=<class 'numpy.float32'>)
  Parameter textcnn1_conv_max_pooling1_conv0_bias (shape=(100,), dtype=<class 'numpy.float32'>)
  Parameter textcnn1_conv_max_pooling2_conv0_weight (shape=(100, 1, 5, 5), dtype=<class 'numpy.float32'>)
  Parameter textcnn1_conv_max_pooling2_conv0_bias (shape=(100,), dtype=<class 'numpy.float32'>)
  Parameter textcnn1_dense0_weight (shape=(2, 300), dtype=<class 'numpy.float32'>)
  Parameter textcnn1_dense0_bias (shape=(2,), dtype=<class 'numpy.float32'>)
)

In [23]:
type(cnn_model.param_blocks[1][1])

mxnet.ndarray.ndarray.NDArray