# 参考
- https://github.com/gaussic/text-classification mxnet(gluon)/pytorch实现
- https://mxnet.incubator.apache.org/tutorials/nlp/cnn.html mxnet(sym scratch) 

# load Data

In [1]:
from __future__ import print_function

from collections import Counter
import itertools
import numpy as np
import re

try:
    # For Python 3.0 and later
    from urllib.request import urlopen
except ImportError:
    # Fall back to Python 2's urllib2
    from urllib2 import urlopen
    
def clean_str(string):
    """
    Tokenization/string cleaning.
    Original from https://github.com/yoonkim/CNN_sentence/blob/master/process_data.py
    """
    string = re.sub(r"[^A-Za-z0-9(),!?\'\`]", " ", string)
    string = re.sub(r"\'s", " \'s", string)
    string = re.sub(r"\'ve", " \'ve", string)
    string = re.sub(r"n\'t", " n\'t", string)
    string = re.sub(r"\'re", " \'re", string)
    string = re.sub(r"\'d", " \'d", string)
    string = re.sub(r"\'ll", " \'ll", string)
    string = re.sub(r",", " , ", string)
    string = re.sub(r"!", " ! ", string)
    string = re.sub(r"\(", " \( ", string)
    string = re.sub(r"\)", " \) ", string)
    string = re.sub(r"\?", " \? ", string)
    string = re.sub(r"\s{2,}", " ", string)
    
    return string.strip().lower()

def download_sentences(url):
    """
    Download sentences from specified URL. 
    
    Strip trailing newline, convert to Unicode.
    """
    
    remote_file = urlopen(url)
    return [line.decode('Latin1').strip() for line in remote_file.readlines()]
    
def load_data_and_labels():
    """
    Loads polarity data from files, splits the data into words and generates labels.
    Returns split sentences and labels.
    """

    positive_examples = download_sentences('https://raw.githubusercontent.com/yoonkim/CNN_sentence/master/rt-polarity.pos')
    negative_examples = download_sentences('https://raw.githubusercontent.com/yoonkim/CNN_sentence/master/rt-polarity.neg')
    
    # Tokenize
    x_text = positive_examples + negative_examples
    x_text = [clean_str(sent).split(" ") for sent in x_text]

    # Generate labels
    positive_labels = [1 for _ in positive_examples]
    negative_labels = [0 for _ in negative_examples]
    y = np.concatenate([positive_labels, negative_labels], 0)
    return x_text, y


def pad_sentences(sentences, padding_word=""):
    """
    Pads all sentences to be the length of the longest sentence.
    Returns padded sentences.
    """
    sequence_length = max(len(x) for x in sentences)
    padded_sentences = []
    for i in range(len(sentences)):
        sentence = sentences[i]
        num_padding = sequence_length - len(sentence)
        new_sentence = sentence + [padding_word] * num_padding
        padded_sentences.append(new_sentence)
        
    return padded_sentences


def build_vocab(sentences):
    """
    Builds a vocabulary mapping from token to index based on the sentences.
    Returns vocabulary mapping and inverse vocabulary mapping.
    """
    # Build vocabulary
    word_counts = Counter(itertools.chain(*sentences))
    
    # Mapping from index to word
    vocabulary_inv = [x[0] for x in word_counts.most_common()]
    
    # Mapping from word to index
    vocabulary = {x: i for i, x in enumerate(vocabulary_inv)}
    
    return vocabulary, vocabulary_inv


def build_input_data(sentences, labels, vocabulary):
    """
    Maps sentences and labels to vectors based on a vocabulary.
    """
    x = np.array([
            [vocabulary[word] for word in sentence]
            for sentence in sentences])
    y = np.array(labels)
    
    return x, y

"""
Loads and preprocesses data for the MR dataset.
Returns input vectors, labels, vocabulary, and inverse vocabulary.
"""
# Load and preprocess data
sentences, labels = load_data_and_labels()
sentences_padded = pad_sentences(sentences)
vocabulary, vocabulary_inv = build_vocab(sentences_padded)
x, y = build_input_data(sentences_padded, labels, vocabulary)

vocab_size = len(vocabulary)

# randomly shuffle data
np.random.seed(10)
shuffle_indices = np.random.permutation(np.arange(len(y)))
x_shuffled = x[shuffle_indices]
y_shuffled = y[shuffle_indices]

# split train/test set
# there are a total of 10662 labeled examples to train on
x_train, x_test = x_shuffled[:-1000], x_shuffled[-1000:]
y_train, y_test = y_shuffled[:-1000], y_shuffled[-1000:]

sentence_size = x_train.shape[1]

print('Train/Test split: %d/%d' % (len(y_train), len(y_test)))
print('train shape:', x_train.shape)
print('test shape:', x_test.shape)
print('vocab_size', vocab_size)
print('sentence max words', sentence_size)

Train/Test split: 9662/1000
train shape: (9662, 56)
test shape: (1000, 56)
vocab_size 18766
sentence max words 56


# gen train test set 

In [2]:
from mxnet import gluon
batch_size = 50
dataset_train = gluon.data.ArrayDataset(x_train.astype('float32'), y_train.astype('float32'))
train_data = gluon.data.DataLoader(dataset_train, batch_size, shuffle=True)
dataset_test = gluon.data.ArrayDataset(x_test.astype('float32'), y_test.astype('float32'))
test_data = gluon.data.DataLoader(dataset_test, batch_size, shuffle=True)
for data, label in test_data:
    print(data, label)
    break

  Optimizer.opt_registry[name].__name__))


In [4]:
from mxnet import gluon
from mxnet.gluon import nn

import sys
sys.path.append('.')
from mxnet import ndarray as nd
from mxnet import autograd
import utilstmp
import time

def net_structure(num_hidden, num_outputs, ctx, num_embed):
    net = gluon.nn.Sequential()
    num_embed = num_embed # dimensions to embed words into
    filter_size = 3
    
    with net.name_scope():
        net.add(
            nn.Embedding(vocab_size, num_embed),
            nn.Conv1D(num_embed, 3),
            nn.GlobalMaxPool1D(),
            nn.Dropout(0.5),
            nn.Dense(num_outputs)
        )
        print(net)
        print(net.collect_params())
    net.initialize(ctx=ctx)
    return net

def mlp(optimizer='sgd', num_outputs=2, num_hidden=256, weight_scale=.01, learning_rate=0.0005, 
        num_epoch=10, batch_size=50, num_embed=300):
    batch_size = batch_size

    ctx = utilstmp.try_gpu(device_id=1)
    net = net_structure(num_hidden=num_hidden, num_outputs=num_outputs, ctx=ctx, num_embed=num_embed)
    loss = gluon.loss.SoftmaxCrossEntropyLoss()
    trainer = gluon.Trainer(net.collect_params(), optimizer, {'learning_rate': learning_rate})
    utilstmp.train(train_data, test_data, net, loss, trainer, ctx, num_epochs=num_epoch)

In [6]:
mlp(learning_rate=0.003, num_epoch=10, batch_size=5, optimizer='adam', num_embed=128)

Sequential(
  (0): Embedding(18766 -> 128, float32)
  (1): Conv1D(None -> 128, kernel_size=(3,), stride=(1,))
  (2): GlobalMaxPool1D(size=(1,), stride=(1,), padding=(0,), ceil_mode=True)
  (3): Dropout(p = 0.5)
  (4): Dense(None -> 2, linear)
)
sequential1_ (
  Parameter sequential1_embedding0_weight (shape=(18766, 128), dtype=<class 'numpy.float32'>)
  Parameter sequential1_conv0_weight (shape=(128, 0, 3), dtype=<class 'numpy.float32'>)
  Parameter sequential1_conv0_bias (shape=(128,), dtype=<class 'numpy.float32'>)
  Parameter sequential1_dense0_weight (shape=(2, 0), dtype=<class 'numpy.float32'>)
  Parameter sequential1_dense0_bias (shape=(2,), dtype=<class 'numpy.float32'>)
)
Start training on  gpu(1)
Epoch 0. Loss: 0.694, Train acc 0.51, Test acc 0.52, Time 1.3 sec
Epoch 1. Loss: 0.529, Train acc 0.73, Test acc 0.75, Time 1.1 sec
Epoch 2. Loss: 0.195, Train acc 0.92, Test acc 0.75, Time 1.1 sec
Epoch 3. Loss: 0.037, Train acc 0.99, Test acc 0.77, Time 1.1 sec
Epoch 4. Loss: 0.006,