# Character-level Convolutional Networks for text Classification
https://arxiv.org/pdf/1509.01626.pdf


In [1]:
import pandas as pd
import matplotlib.pyplot as plt
from torchtext import data
import nltk
import json
from torchtext import vocab
from tqdm import tqdm
import torch
from tensorboardX import SummaryWriter
import random
import chakin
import re
import os
import numpy as np
import sys
import random
import tarfile
import torch.nn.functional as F
import urllib
from torchtext import data
import datetime
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
import pandas as pd

In [2]:
NUM_FILTERS = 32 # number of convolutional filters per convolutional layer
NUM_OUTPUTS = 2 # number of classes
FULLY_CONNECTED = 128 # number of unit in the fully connected dense layer
DROPOUT_RATE = 0.2 # probability of node drop out
LEARNING_RATE = 0.1 # learning rate of the gradient
MOMENTUM = 0.9 # momentum of the gradient
WDECAY = 0.00001 # regularization term to limit size of weights

In [3]:
ALPHABET = list("abcdefghijklmnopqrstuvwxyz0123456789-,;.!?:'\"/\\|_@#$%^&*~`+ =<>()[]{}") # The 69 characters as specified in the paper
ALPHABET_INDEX = {letter: index for index, letter in enumerate(ALPHABET)} # { a: 0, b: 1, etc}
FEATURE_LEN = 64 # max-length in characters for one document
BATCH_SIZE = 8 # number of documents per batch
FEATURE_LENGTH = len(list(ALPHABET))

In [4]:
# SEED = 1
# split = 0.80
# data_block = []
# negative_data  = open('rt-polaritydata/rt-polarity.neg',encoding='utf8',errors='ignore').read().splitlines()
# for i in negative_data:
#         data_block.append([str(i.strip()),0]) 
# positve_data  = open('rt-polaritydata/rt-polarity.pos',encoding='utf8',errors='ignore').read().splitlines()
# for i in positve_data:
#         data_block.append([str(i.strip()), 1]) 

In [5]:
# review = [i[0] for i in data_block]
# sentment = [i[1] for i in data_block]

In [6]:
# dataset = pd.DataFrame({'review':review,'sentment':sentment})
# dataset = dataset.sample(frac=1).reset_index(drop=True)
# dataset = dataset.values

In [7]:
dataset = pd.read_csv("labeledTrainData.tsv",sep="\t")
# dataset = pd.DataFrame({'review':review,'sentment':sentment})
dataset = dataset.sample(frac=1).reset_index(drop=True)
dataset = dataset[["review","sentiment"]].values
# print(dataset)

In [8]:
class data_loader():
    def encode(self, text):
        encoded = np.zeros([len(ALPHABET), FEATURE_LEN], dtype='float32')
        review = text.lower()[:FEATURE_LEN-1:-1]
        i = 0
        for letter in text:
            if i >= FEATURE_LEN:
                break;
            if letter in ALPHABET_INDEX:
                encoded[ALPHABET_INDEX[letter]][i] = 1
            i += 1
        return encoded
    
    def transform(self,x, y):
        if y == 0:
            y = [1,0]
        else:
            y = [0,1]
        return self.encode(x), y
    
    def split(self, dataset, split_ratio):
        random.shuffle(dataset)
        train_data = dataset[0:int(len(dataset)*split_ratio)]
        test_data = dataset[int(len(dataset)*split_ratio):]
        return train_data, test_data
    
    def get_train_data(self,train_data):
        
        for i in range(int(len(train_data)/BATCH_SIZE)):
            processed_data = []
            onehot_labels = []
            for review, label in train_data[i*BATCH_SIZE:(i+1)*BATCH_SIZE]:
                x, y  = self.transform(review, label)
                processed_data.append(x)
                onehot_labels.append(y)
            if len(processed_data) == BATCH_SIZE:
                yield np.asarray(processed_data), np.asarray(onehot_labels)

In [9]:
DL = data_loader()
train_iterator, test_iterator = DL.split(dataset, 0.8)
for x,y in DL.get_train_data(train_iterator):
    print(x.shape, y.shape)
    print(y)
    break

(8, 69, 64) (8, 2)
[[0 1]
 [0 1]
 [0 1]
 [0 1]
 [1 0]
 [0 1]
 [0 1]
 [1 0]]


In [10]:
class charcterCNN(torch.nn.Module):
    def __init__(self,NUM_FILTERS, BATCH_SIZE, FULLY_CONNECTED, DROPOUT_RATE, NUM_OUTPUTS, FEATURE_LENGTH):
        super(charcterCNN, self).__init__()
        self.NUM_FILTERS = NUM_FILTERS 
        self.BATCH_SIZE = BATCH_SIZE 
        self.FULLY_CONNECTED  =FULLY_CONNECTED
        self.DROPOUT_RATE = DROPOUT_RATE
        self.NUM_OUTPUTS = NUM_OUTPUTS
        self.FEATURE_LENGTH = FEATURE_LENGTH
        
        self.conv1 = torch.nn.Conv1d(in_channels=FEATURE_LENGTH, out_channels=NUM_FILTERS, kernel_size=3, stride=2)
        self.pool1 = torch.nn.MaxPool1d(kernel_size=3,stride=3)
        self.conv2 = torch.nn.Conv1d(in_channels=NUM_FILTERS, out_channels=NUM_FILTERS, kernel_size=3, stride=2)
        self.pool2 = torch.nn.MaxPool1d(kernel_size=3,stride=3)
        self.conv3 = torch.nn.Conv1d(in_channels=NUM_FILTERS, out_channels=NUM_FILTERS, kernel_size=3, stride=2, padding=1)
        self.conv4 = torch.nn.Conv1d(in_channels=NUM_FILTERS, out_channels=NUM_FILTERS, kernel_size=3, stride=2, padding=1)
        self.conv5 = torch.nn.Conv1d(in_channels=NUM_FILTERS, out_channels=NUM_FILTERS, kernel_size=3, stride=2, padding=1)
        self.conv6 = torch.nn.Conv1d(in_channels=NUM_FILTERS, out_channels=NUM_FILTERS, kernel_size=3, stride=2,padding=1)
        self.pool3 = torch.nn.MaxPool1d(kernel_size=3,stride=3)
        self.dense1 = torch.nn.Linear(NUM_FILTERS, FULLY_CONNECTED)
        self.dropout1 = torch.nn.Dropout(DROPOUT_RATE)
        self.dense2 = torch.nn.Linear(FULLY_CONNECTED, FULLY_CONNECTED)
        self.dropout2 = torch.nn.Dropout(DROPOUT_RATE)
        self.dense3 = torch.nn.Linear(FULLY_CONNECTED, NUM_OUTPUTS)
        
    def forward(self, x):
        conv1_out = F.relu(self.conv1(x))
        pool1_out = self.pool1(conv1_out)
        conv2_out = F.relu(self.conv2(pool1_out))
        pool2_out = self.pool1(conv2_out)
        conv3_out = F.relu(self.conv3(pool2_out))
#         conv4_out = F.relu(self.conv4(conv3_out))
#         conv5_out = F.relu(self.conv5(conv4_out))
#         conv6_out = F.relu(self.conv6(conv5_out))
        pool3_out = self.pool3(conv3_out)
#         print(pool3_out.shape)
        flatten = pool3_out.view(self.BATCH_SIZE,-1)
#         print("flatten.shape : ",flatten.shape)
        dense1_out = F.relu(self.dense1(flatten))
        dropout1_out = self.dropout1(dense1_out)
        dense2_out =  F.relu(self.dense2(dropout1_out))
        dropout2_out = self.dropout2(dense2_out)
        dense3_out = F.relu(self.dense3(dropout2_out))
        return torch.softmax(dense3_out, dim=1)
character_network = charcterCNN(NUM_FILTERS, BATCH_SIZE, FULLY_CONNECTED, DROPOUT_RATE,NUM_OUTPUTS, FEATURE_LENGTH)
character_network = character_network.cuda()

In [11]:
input = torch.randn([8, 69, 32])
character_network(input.to(device)).shape

torch.Size([8, 2])

In [12]:
optimizer = torch.optim.Adam(character_network.parameters(), lr=LEARNING_RATE, weight_decay=WDECAY)
criterion = torch.nn.BCEWithLogitsLoss()
criterion = criterion.to(device)

In [13]:
def binary_accuracy(preds, y):
    """
    Returns accuracy per batch, i.e. if you get 8/10 right, this returns 0.8, NOT 8
    """
#     print(preds.shape, y.shape)
    rounded_preds = torch.argmax(preds, dim=1)
    
    correct = (rounded_preds == torch.argmax(y, dim=1)).float() #convert into float for division 
    acc = correct.sum()/len(correct)
    return acc

In [14]:
binary_accuracy(torch.Tensor([[0.1,0.9],[0.1,0.9],[0.1,0.9],[0.1,0.9]]), torch.Tensor([[0,1],[0,1],[1,0],[1,0]]))

tensor(0.5000)

In [15]:
def test_accuracy_calculator(character_network, test_iterator):
    epoch_acc = 0
    for X, Y in DL.get_train_data(test_iterator):
        X  =torch.Tensor(x).to(device)
        Y = torch.Tensor(Y).to(device)
#         print(">>", X.shape, Y.shape)
        predictions = character_network(X)            
        acc = binary_accuracy(predictions.type(torch.FloatTensor), Y.type(torch.FloatTensor))
        epoch_acc += acc.item()
    return  epoch_acc / len(test_iterator)

In [None]:
def train(character_network, iterator,  optimizer, criterion):
    DL = data_loader()
    epoch_loss = 0
    epoch_acc = 0
    train_data, test_data = DL.split(dataset, 0.8)
    for X,Y in DL.get_train_data(train_data):
#         print(x.shape, y.shape)
        X  =torch.Tensor(x).to(device)
        Y = torch.Tensor(Y).to(device)
        optimizer.zero_grad()
        predictions = character_network(X)
        loss = criterion(predictions.type(torch.FloatTensor), Y.type(torch.FloatTensor))
        loss.backward()
        optimizer.step()
        acc = binary_accuracy(predictions.type(torch.FloatTensor), Y.type(torch.FloatTensor))
        epoch_loss += loss.item()
        epoch_acc += acc.item()
    return character_network, epoch_loss / len(iterator), epoch_acc / len(iterator)


In [None]:
epochs  = 500
log_interval = 1
loss = []
accuracy = []
test_accuracy = []
writer = SummaryWriter()
for i in range(epochs):
    character_network, epoch_loss, epoch_acc = train(character_network, train_iterator, optimizer, criterion)
    test_acc = test_accuracy_calculator(character_network, test_iterator)
    accuracy.append(epoch_acc)
    loss.append(epoch_loss)
    test_accuracy.append(test_acc)
    writer.add_scalar('Traing Loss',epoch_loss, i)
    writer.add_scalar('Test Accuracy',test_acc, i)
    writer.add_scalar('Train Accuracy',epoch_acc, i)
    print(epoch_loss)
writer.export_scalars_to_json("./all_scalars.json")
writer.close()


0.0924488228738308
0.09518883235156536
0.09429516029357911
0.08860195954144001
0.08540533438920975
0.08154082601070405
0.07792498716413974
0.08994163453876972
0.08307997691333294
0.06881366467475891
0.06648885649442672
0.06487378827631474
0.06403036640286446
0.0634362035870552
0.06311130195856095
0.06295889728963375
0.06292067487239837
0.0629086206138134
0.06290532306134701
0.06290215365588665
0.06290182130336762
0.06290201703906059
0.06290200054943562
0.06290223934054374
0.0629019049346447
0.06290198749005795
0.06290249069333076
0.06290228542089463
0.06290199010074139
0.06290236303508281
0.06290201343894004
0.06290208487510682
0.06290249494314194
0.06290163386464119
0.06290221158266067
0.06290384095013142
0.06290182429850101
0.06290205989778042
0.06290219406187535
0.06290161255896091
0.06290266713202
0.0629025886029005
0.06290187209546566
0.06290186500251294
0.06290189984738827
0.06290224293768405
0.06290164383649827
0.0629022698611021
0.06290161883831025
0.06290200825631619
0.0629024