# Character-level Convolutional Networks for text Classification


## Data Pre-processing


In [63]:
import pandas as pd
df1 = pd.read_excel("data2.xlsx")

In [1]:
companies = {}
def get_sample_text(sample):
    assert sample['column'][3]['@name'] == 'text'
    return sample['column'][3]['#text']


def get_sample_answers_bank(sample):
    answers = {}
    for i in range(4, 12):
        companies[sample['column'][i]['@name']] = i
        answers[sample['column'][i]['@name']] = None if sample['column'][i]['#text'] == 'NULL'\
            else int(sample['column'][i]['#text'])
    return answers

def get_sample_answers_tkk(sample):
    answers = {}
    for i in range(4, 11):
        companies[sample['column'][i]['@name']] = i
        answers[sample['column'][i]['@name']] = None if sample['column'][i]['#text'] == 'NULL'\
            else int(sample['column'][i]['#text'])
    return answers

def get_sample_id(sample):
    assert sample['column'][0]['@name'] == 'id'
    return int(sample['column'][0]['#text'])


def get_data(filename):
    df = pd.DataFrame()
    with open(filename, "r", encoding='utf-8') as f:
        d = xmltodict.parse(f.read(), process_namespaces=True)
        clean_samples = []
        for sample in d['pma_xml_export']['database']['table']:
            sample_id = get_sample_id(sample)
            text = get_sample_text(sample)
            answers = get_sample_answers_bank(sample)
            for company, answer in answers.items():
                if answer is not None:
                    clean_samples.append((sample_id, text, company, answer))
        df['text'] = [sample[1] for sample in clean_samples]
        df['answer'] = [sample[3] for sample in clean_samples]
        df['company'] = [sample[2] for sample in clean_samples]
        df['sample_id'] = [sample[0] for sample in clean_samples]
    return df

In [7]:
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.manifold import TSNE 
import xmltodict
import re

train_filename = "bank_train_2016.xml"
test_filename = "banks_test_etalon.xml"

train = get_data(train_filename)
test = get_data(test_filename)


In [10]:
url_replacement = lambda x: re.sub(r'(?:http[^\s]+)($|\s)', r'url\1', x)
user_replacement = lambda x: re.sub(r'(?:@[^\s]+)($|\s)', r'user\1', x)

train['text'] = train['text'].apply(url_replacement)
train['text'] = train['text'].apply(user_replacement)

test['text'] = test['text'].apply(url_replacement)
test['text'] = test['text'].apply(user_replacement)

test['text'] = train['text'].str.lower()
train['text'] = train['text'].str.lower()

In [12]:
train.head()

Unnamed: 0,text,answer,company,sample_id
0,url взять кредит тюмень альфа банк,0,alfabank,1
1,мнение о кредитной карте втб 24 url,0,vtb,2
2,«райффайзенбанк»: снижение ключевой ставки цб ...,0,raiffeisen,3
3,современное состояние кредитного поведения в р...,0,sberbank,4
4,user user главное чтоб банки сбер и втб!!!,1,sberbank,5


In [13]:
import os

import mxnet as mx

### Load the data in memory

Helper functions to read from the .json.gzip files

Let's visualize the data:

### Creating the dataset

In [14]:
import multiprocessing

from mxnet import nd, autograd, gluon
from mxnet.gluon.data import ArrayDataset
from mxnet.gluon.data import DataLoader
import numpy as np

Setting up the parameters for the network

In [15]:
ALPHABET = list("абвеёжзклмнопрдйгсуфхцчшщъыьэюяит0123456789-,;.!?:'\"/\\|_@#$%^&*~`+ =<>()[]{}") # The characters as specified in the paper
ALPHABET_INDEX = {letter: index for index, letter in enumerate(ALPHABET)} # { a: 0, b: 1, etc}
FEATURE_LEN = 157 # max-length in characters for one document
NUM_WORKERS = max(multiprocessing.cpu_count() - 3, 1)# number of workers used in the data loading
BATCH_SIZE = 4000 # number of documents per batch

In [17]:
def encode(text):
    encoded = np.zeros([len(ALPHABET), FEATURE_LEN], dtype='float32')
    review = text.lower()[:FEATURE_LEN-1:-1]
    i = 0
    for letter in text:
        if i >= FEATURE_LEN:
            break;
        if letter in ALPHABET_INDEX:
            encoded[ALPHABET_INDEX[letter]][i] = 1
        i += 1
    return encoded

In [18]:
def transform(x, y):
    return encode(x), y

In [21]:

train_data_X = train['text'].as_matrix()
train_data_Y = train['answer'].as_matrix()
test_data_X = test['text'].as_matrix()
test_data_Y = test['answer'].as_matrix()


  
  This is separate from the ipykernel package so we can avoid doing imports until
  after removing the cwd from sys.path.
  """


In [22]:
train_dataset = ArrayDataset(train_data_X, train_data_Y).transform(transform)
test_dataset = ArrayDataset(test_data_X, test_data_Y).transform(transform)

Creating the training and testing dataloader, with NUM_WORKERS set to the number of CPU core

In [23]:
batch_size = 4000
train_data_loader = mx.gluon.data.DataLoader(train_dataset, batch_size, shuffle=True)
test_data_loader = mx.gluon.data.DataLoader(test_dataset, batch_size)

## Creation of the network

The context will define where the training takes place, on the CPU or on the GPU

In [24]:
ctx = mx.cpu()

We create the network following the instructions describe in the paper, using the small feature and small output units configuration

![img](data/diagram.png)
![img](data/convolutional_layers.png)
![img](data/dense_layer.png)


Based on the paper we set the following parameters:

In [26]:
NUM_FILTERS = 256 # number of convolutional filters per convolutional layer
NUM_OUTPUTS = 2 # number of classes
FULLY_CONNECTED = 1024 # number of unit in the fully connected dense layer
DROPOUT_RATE = 0.5 # probability of node drop out
LEARNING_RATE = 0.01 # learning rate of the gradient
MOMENTUM = 0.04 # momentum of the gradient
WDECAY = 0.0001 # regularization term to limit size of weights

In [27]:
net = gluon.nn.HybridSequential()
with net.name_scope():
    net.add(gluon.nn.Conv1D(channels=NUM_FILTERS, kernel_size=7, activation='relu'))
    net.add(gluon.nn.MaxPool1D(pool_size=3, strides=3))
    net.add(gluon.nn.Conv1D(channels=NUM_FILTERS, kernel_size=7, activation='relu'))
    net.add(gluon.nn.MaxPool1D(pool_size=3, strides=3))
    net.add(gluon.nn.Conv1D(channels=NUM_FILTERS, kernel_size=3, activation='relu'))
    net.add(gluon.nn.Conv1D(channels=NUM_FILTERS, kernel_size=3, activation='relu'))
    net.add(gluon.nn.Conv1D(channels=NUM_FILTERS, kernel_size=3, activation='relu'))
    net.add(gluon.nn.Conv1D(channels=NUM_FILTERS, kernel_size=3, activation='relu'))
    net.add(gluon.nn.MaxPool1D(pool_size=3, strides=3))
    net.add(gluon.nn.Flatten())
    net.add(gluon.nn.Dense(FULLY_CONNECTED, activation='relu'))
    net.add(gluon.nn.Dropout(DROPOUT_RATE))
    net.add(gluon.nn.Dense(FULLY_CONNECTED, activation='relu'))
    net.add(gluon.nn.Dropout(DROPOUT_RATE))
    net.add(gluon.nn.Dense(NUM_OUTPUTS))


In [28]:
print(net)

HybridSequential(
  (0): Conv1D(None -> 256, kernel_size=(7,), stride=(1,), Activation(relu))
  (1): MaxPool1D(size=(3,), stride=(3,), padding=(0,), ceil_mode=False)
  (2): Conv1D(None -> 256, kernel_size=(7,), stride=(1,), Activation(relu))
  (3): MaxPool1D(size=(3,), stride=(3,), padding=(0,), ceil_mode=False)
  (4): Conv1D(None -> 256, kernel_size=(3,), stride=(1,), Activation(relu))
  (5): Conv1D(None -> 256, kernel_size=(3,), stride=(1,), Activation(relu))
  (6): Conv1D(None -> 256, kernel_size=(3,), stride=(1,), Activation(relu))
  (7): Conv1D(None -> 256, kernel_size=(3,), stride=(1,), Activation(relu))
  (8): MaxPool1D(size=(3,), stride=(3,), padding=(0,), ceil_mode=False)
  (9): Flatten
  (10): Dense(None -> 1024, Activation(relu))
  (11): Dropout(p = 0.5, axes=())
  (12): Dense(None -> 1024, Activation(relu))
  (13): Dropout(p = 0.5, axes=())
  (14): Dense(None -> 2, linear)
)


In [29]:
hybridize = False # for speed improvement, compile the network but no in-depth debugging possible
load_params = False # Load pre-trained model

### Parameter initialization

In [30]:
net.initialize(mx.init.Xavier(magnitude=2.24), ctx=ctx)

### Hybridization

In [31]:
if hybridize:
    net.hybridize(static_alloc=True, static_shape=True)

### Softmax cross-entropy Loss

We are in a multi-class classification problem, so we use the [Softmax Cross entropy loss](https://deepnotes.io/softmax-crossentropy)

In [32]:
softmax_cross_entropy = gluon.loss.SoftmaxCrossEntropyLoss()

### Optimizer

In [33]:
trainer = gluon.Trainer(net.collect_params(), 'sgd', 
                        {'learning_rate': LEARNING_RATE, 
                         'wd':WDECAY, 
                         'momentum':MOMENTUM})

### Evaluate Accuracy

In [34]:
def evaluate_accuracy(data_iterator, net):
    acc = mx.metric.Accuracy()
    for i, (data, label) in enumerate(data_iterator):
        data = data.as_in_context(ctx)
        label = label.as_in_context(ctx)
        output = net(data)
        prediction = nd.argmax(output, axis=1)
        acc.update(preds=prediction, labels=label)
    return acc.get()[1]

### Training Loop
We loop through the batches given by the data_loader. These batches have been asynchronously fetched by the workers.

After an epoch, we measure the test_accuracy and save the parameters of the model

In [35]:
start_epoch = 0
number_epochs = 7
smoothing_constant = .01
for e in range(start_epoch, number_epochs):
    for i, (review, label) in enumerate(train_data_loader):
        review = review.as_in_context(ctx)
        label = label.as_in_context(ctx)
        with autograd.record():
            output = net(review)
            loss = softmax_cross_entropy(output, label)
        loss.backward()
        trainer.step(review.shape[0])
        
        # moving average of the loss
        curr_loss = nd.mean(loss)
        moving_loss = (curr_loss if (i == 0) 
                       else (1 - smoothing_constant) * moving_loss + (smoothing_constant) * curr_loss)

        #if (i%200 == 0):
            #print('Batch {}: Instant loss {:.4f}, Moving loss {:.4f}'.format(i,curr_loss.asscalar(), moving_loss.asscalar()))

    test_accuracy = evaluate_accuracy(test_data_loader, net)
    #Save the model using the gluon params format
    #net.save_parameters('crepe_epoch_{}_test_acc_{}.params'.format(e,int(test_accuracy*10000)/100))
    print("Epoch {}. Loss: {:.4f}, Test_acc {:.4f}".format(e, moving_loss.asscalar(), test_accuracy))

Epoch 0. Loss: 0.6934, Test_acc 0.6776
Epoch 1. Loss: 0.6740, Test_acc 0.6776
Epoch 2. Loss: 0.6555, Test_acc 0.6776
Epoch 3. Loss: 0.6381, Test_acc 0.6776
Epoch 4. Loss: 0.6206, Test_acc 0.6776
Epoch 5. Loss: 0.6040, Test_acc 0.6776
Epoch 6. Loss: 0.5892, Test_acc 0.6776


In [32]:
#net.export('crepe', epoch=number_epochs)