# 8. Convolutional Neural Networks

I recommend you take a look at these materials first.

* http://web.stanford.edu/class/cs224n/lectures/cs224n-2017-lecture13-CNNs.pdf
* http://www.aclweb.org/anthology/D14-1181
* https://github.com/Shawn1993/cnn-text-classification-pytorch
* http://cogcomp.org/Data/QA/QC/

In [5]:
import torch
import torch.nn as nn
from torch.autograd import Variable
import torch.optim as optim
import torch.nn.functional as F
import nltk
import random
import numpy as np
from collections import Counter, OrderedDict
import nltk
import re
from copy import deepcopy
flatten = lambda l: [item for sublist in l for item in sublist]
from data_utils_discrim import Dictionary, Corpus

In [6]:
# Hyper-parameters.
# In the visible part of this notebook only `batch_size` and `seq_length`
# are read again (by the corpus-loading cell).
batch_size, seq_length = 20, 30

# Model-size settings.
embed_size = 100  # TODO: the original author flagged this value as unverified
hidden_size = 1024
num_layers = 1

# Training / sampling settings.
learning_rate = 0.002
num_epochs = 5
num_samples = 1000  # number of words to be sampled

In [7]:

# Load the two text files that are later labelled "real" (1) and "fake" (0).
path_real, path_fake = (
    './language_model_basic_lstm/data/train.txt',
    './language_model_basic_lstm/sample2.txt',
)

# Both files go through the same Corpus, so they share one dictionary.
corpus = Corpus()

# Dictionary size is printed once after the real text only ...
ids_real = corpus.get_data(path_real, batch_size)
print(len(corpus.dictionary))

# ... and once more after the fake text has been read as well.
ids_fake = corpus.get_data(path_fake, batch_size)
vocab_size = len(corpus.dictionary)
print(vocab_size)

num_batches = ids_fake.size(1) // seq_length

print(ids_real.shape, ids_fake.shape)

10000
18755
torch.Size([42068, 30]) torch.Size([27963, 30])


In [8]:
# Choose the tensor constructors once, based on whether CUDA is available.
USE_CUDA = torch.cuda.is_available()
# USE_CUDA = False  # uncomment to force CPU

FloatTensor, LongTensor, ByteTensor = (
    (torch.cuda.FloatTensor, torch.cuda.LongTensor, torch.cuda.ByteTensor)
    if USE_CUDA
    else (torch.FloatTensor, torch.LongTensor, torch.ByteTensor)
)



In [9]:
def getBatch(batch_size,train_data):
    """Yield shuffled mini-batches of ``train_data``.

    ``train_data`` is shuffled IN PLACE, then sliced into consecutive chunks
    of ``batch_size`` items; the last chunk may be shorter. Nothing is
    yielded for an empty ``train_data`` (previously a single empty batch was
    produced, which the batch collation step cannot unpack).

    Args:
        batch_size: number of examples per batch; must be positive.
        train_data: mutable sequence (e.g. a list) of examples.

    Yields:
        Slices of ``train_data`` with at most ``batch_size`` items each.

    Raises:
        ValueError: if ``batch_size`` is not positive (the old loop never
            terminated in that case).
    """
    if batch_size <= 0:
        raise ValueError("batch_size must be a positive integer, got %r" % (batch_size,))

    random.shuffle(train_data)
    for start in range(0, len(train_data), batch_size):
        yield train_data[start:start + batch_size]

In [10]:
def pad_to_batch(batch):
    """Collate a list of ``(sequence, label)`` pairs into one batch.

    Despite the name, no padding is done: the sequences are combined with
    ``torch.stack``, so they must already share the same shape.

    Args:
        batch: iterable of 2-tuples ``(sequence_tensor, label)``.

    Returns:
        ``(inputs, targets)`` -- the stacked sequences (moved to the GPU when
        ``USE_CUDA`` is set) and the labels as a ``LongTensor``, both wrapped
        in ``Variable``.
    """
    sequences, labels = map(list, zip(*batch))

    stacked = torch.stack(sequences)
    if USE_CUDA:
        stacked = stacked.cuda()

    return Variable(stacked), Variable(LongTensor(labels))

In [11]:
def prepare_sequence(seq, to_index):
    """Convert the tokens of ``seq`` into a ``Variable`` of ids.

    Tokens missing from ``to_index`` are replaced by the id stored under
    ``"<UNK>"``; that entry is only looked up when an unknown token occurs.
    """
    idxs = []
    for token in seq:
        key = token if token in to_index else "<UNK>"
        idxs.append(to_index[key])
    return Variable(LongTensor(idxs))

In [12]:
print(ids_fake.shape)


torch.Size([27963, 30])


In [13]:

# Vocabulary lookups taken from the shared corpus dictionary.
word2index = corpus.dictionary.word2idx
index2word = corpus.dictionary.idx2word

# Cap the real rows at the number of fake rows.
ids_real_used = ids_real[:ids_fake.shape[0]]

# Attach the class label to every row: 1 for real text, 0 for fake text.
paired_real = [(row, 1) for row in ids_real_used]
paired_fake = [(row, 0) for row in ids_fake]

paired = paired_real + paired_fake
random.shuffle(paired)

# First 90% of the shuffled pairs for training, the remainder for testing.
train_data = paired[:int(len(paired) * 0.9)]
test_data = paired[int(len(paired) * 0.9):]

print(len(train_data))

50333


In [14]:
# print(train_data[:20])

In [15]:
    
# data_p = list(zip(X_p,y_p))
# random.shuffle(data_p)

# train_data = data_p[:int(len(data_p)*0.9)]
# test_data = data_p[int(len(data_p)*0.9):]

### Load Pretrained word vector

You can download pretrained word vectors from here: https://github.com/mmihaltz/word2vec-GoogleNews-vectors

In [16]:
import gensim

In [17]:
def load_embed_model():
    """Load the word vectors stored at 'data/glove.6B.100d.txt.word2vec'.

    The file is read in word2vec *text* format (``binary=False``).

    Returns:
        The object returned by ``KeyedVectors.load_word2vec_format``.
    """
    from gensim.models import KeyedVectors

    # File name suggests Stanford GloVe 6B, 100-dimensional vectors.
    return KeyedVectors.load_word2vec_format(
        'data/glove.6B.100d.txt.word2vec', binary=False
    )

# model = KeyedVectors.load_word2vec_format('../dataset/GoogleNews-vectors-negative300.bin', binary=True)
# Load the word vectors with load_embed_model().
# NOTE(review): the name `model` is rebound to the CNN classifier in a later
# cell, so this cell has to be re-run before the embedding-matrix cell can be
# re-run.
model = load_embed_model()
# model = gensim.models.KeyedVectors.load_word2vec_format('../dataset/GoogleNews-vectors-negative300.bin', binary=True)

In [18]:
len(model.index2word)

400000

In [19]:
# Build the embedding matrix: one row per vocabulary index, stacked into
# `pretrained_vectors`.
#
# NOTE(review): `word2index` is indexed with the integer `i`. If `word2index`
# maps word -> index (as its name and its use in `prepare_sequence` suggest),
# this lookup raises KeyError for every `i`, the bare `except` swallows it, and
# every row ends up random -- i.e. the loaded word vectors are never used.
# `index2word[i]` looks like the intended key -- TODO confirm and fix together
# with the embedding width passed to CNNClassifier.
pretrained = []

for i in range(len(word2index)):
    try:
        pretrained.append(model[word2index[i]])
    except:
        # NOTE(review): the fallback row is 300-d, while the loaded file name
        # says '100d'; np.vstack would fail if real and fallback rows were
        # ever mixed -- TODO confirm the intended width.
        pretrained.append(np.random.randn(300))
        
pretrained_vectors = np.vstack(pretrained)

## Modeling 

<img src="../images/08.cnn-for-text-architecture.png">
<center>borrowed image from http://www.aclweb.org/anthology/D14-1181</center>

In [20]:
class CNNClassifier(nn.Module):
    """Convolutional text classifier.

    Token ids are embedded, passed through one convolution per entry of
    ``kernel_sizes`` (each spanning the full embedding width), max-pooled
    over the sequence axis, concatenated and projected to ``output_size``
    log-probabilities.

    Args:
        vocab_size: number of rows in the embedding table.
        embedding_dim: width of each embedding vector.
        output_size: number of target classes.
        kernel_dim: number of feature maps per kernel size.
        kernel_sizes: window heights (in tokens) of the convolutions.
        dropout: dropout probability used before the final linear layer.
    """

    def __init__(self, vocab_size, embedding_dim, output_size, kernel_dim=100, kernel_sizes=(3, 4, 5), dropout=0.5):
        super(CNNClassifier, self).__init__()

        self.embedding = nn.Embedding(vocab_size, embedding_dim)
        # kernel_size = (K, D): K tokens high, the whole embedding wide.
        self.convs = nn.ModuleList([nn.Conv2d(1, kernel_dim, (K, embedding_dim)) for K in kernel_sizes])

        self.dropout = nn.Dropout(dropout)
        self.fc = nn.Linear(len(kernel_sizes) * kernel_dim, output_size)

    def init_weights(self, pretrained_word_vectors=False, is_static=False):
        """Initialise the embedding table.

        Args:
            pretrained_word_vectors: 2-D numpy array whose second dimension
                equals the embedding width. ``False``, ``None`` or the string
                ``'none'`` request a uniform random init in [-0.1, 0.1].
            is_static: when true, freeze the embedding weights.

        Raises:
            ValueError: if the pretrained vectors are not 2-D or their width
                differs from the embedding width the convolutions were built
                for (the forward pass could not succeed in that case).
        """
        # Decide from the ARGUMENT (not from a notebook-level global) whether
        # pretrained vectors were supplied. Identity / isinstance checks avoid
        # comparing a numpy array against a string.
        use_random = (
            pretrained_word_vectors is None
            or pretrained_word_vectors is False
            or (isinstance(pretrained_word_vectors, str) and pretrained_word_vectors == 'none')
        )

        if use_random:
            # In-place initialiser: tensors expose `uniform_`, not `uniform`.
            self.embedding.weight.data.uniform_(-.1, .1)
        else:
            weights = torch.from_numpy(pretrained_word_vectors).float()
            if weights.dim() != 2 or weights.size(1) != self.embedding.embedding_dim:
                raise ValueError(
                    "pretrained vectors must have shape (vocab, %d), got %s"
                    % (self.embedding.embedding_dim, tuple(weights.size()))
                )
            self.embedding.weight = nn.Parameter(weights)

        if is_static:
            self.embedding.weight.requires_grad = False

    def forward(self, inputs, is_training=False):
        """Classify a batch of token-id sequences.

        Args:
            inputs: integer tensor of token ids, shape (B, T); T must be at
                least the largest kernel size.
            is_training: when true, pass the pooled features through the
                dropout layer (which itself is only active in train mode).

        Returns:
            Tensor of shape (B, output_size) holding log-probabilities.
        """
        embedded = self.embedding(inputs).unsqueeze(1)  # (B,1,T,D)
        feature_maps = [F.relu(conv(embedded)).squeeze(3) for conv in self.convs]  # [(B,Co,T-K+1), ...]*len(Ks)
        pooled = [F.max_pool1d(fm, fm.size(2)).squeeze(2) for fm in feature_maps]  # [(B,Co), ...]*len(Ks)

        concated = torch.cat(pooled, 1)  # (B,len(Ks)*Co)

        if is_training:
            concated = self.dropout(concated)
        out = self.fc(concated)
        # Normalise explicitly over the class axis; the implicit-dim form is deprecated.
        return F.log_softmax(out, dim=1)

## Train 

Training takes a while if you use only the CPU.

In [21]:
# Optimisation settings for the classifier.
EPOCH = 5          # passes over the training data
BATCH_SIZE = 50    # examples per mini-batch
LR = 0.001         # learning rate handed to Adam

# Convolution settings.
KERNEL_SIZES = [3, 4, 5]
KERNEL_DIM = 100

# Two classes: the pairing cell labels every row 0 or 1.
num_targets = 2

In [22]:
# Take the embedding width from the matrix itself instead of repeating it as a
# magic number, so the convolutions always match the vectors being loaded.
model = CNNClassifier(len(word2index), pretrained_vectors.shape[1], num_targets, KERNEL_DIM, KERNEL_SIZES)
model.init_weights(pretrained_vectors) # initialize embedding matrix using pretrained vectors
# model.init_weights('none') # alternative: random embedding initialisation


if USE_CUDA:
    model = model.cuda()

# The classifier already returns log-probabilities (log_softmax), so NLLLoss is
# the matching criterion; CrossEntropyLoss would apply log_softmax a second time.
loss_function = nn.NLLLoss()
# Hand only trainable parameters to the optimizer, so a frozen embedding
# (init_weights(..., is_static=True)) is also supported.
optimizer = optim.Adam([p for p in model.parameters() if p.requires_grad], lr=LR)



In [23]:
# Train for EPOCH passes; report the mean loss of the batches seen since the
# previous report at every 100th batch (including batch 0 of each epoch).
for epoch in range(EPOCH):
    losses = []
    batches = getBatch(BATCH_SIZE, train_data)
    for i, batch in enumerate(batches):
        inputs, targets = pad_to_batch(batch)

        # Forward pass with dropout enabled.
        model.zero_grad()
        preds = model(inputs, True)
        loss = loss_function(preds, targets)
        losses.append(loss.data.tolist()[0])

        # Backward pass and parameter update.
        loss.backward()

        # Optional gradient clipping (disabled):
        # for param in model.parameters():
        #     param.grad.data.clamp_(-3, 3)

        optimizer.step()

        if i % 100 != 0:
            continue
        print("[%d/%d] mean_loss : %0.2f" % (epoch, EPOCH, np.mean(losses)))
        losses = []

[0/5] mean_loss : 0.92
[0/5] mean_loss : 0.15
[0/5] mean_loss : 0.07
[0/5] mean_loss : 0.07
[0/5] mean_loss : 0.05
[0/5] mean_loss : 0.06
[0/5] mean_loss : 0.04
[0/5] mean_loss : 0.04
[0/5] mean_loss : 0.05
[0/5] mean_loss : 0.05
[0/5] mean_loss : 0.04
[1/5] mean_loss : 0.00
[1/5] mean_loss : 0.02
[1/5] mean_loss : 0.03
[1/5] mean_loss : 0.03
[1/5] mean_loss : 0.02
[1/5] mean_loss : 0.02
[1/5] mean_loss : 0.02
[1/5] mean_loss : 0.02
[1/5] mean_loss : 0.03
[1/5] mean_loss : 0.03
[1/5] mean_loss : 0.03
[2/5] mean_loss : 0.00
[2/5] mean_loss : 0.01
[2/5] mean_loss : 0.01
[2/5] mean_loss : 0.01
[2/5] mean_loss : 0.02
[2/5] mean_loss : 0.01
[2/5] mean_loss : 0.01
[2/5] mean_loss : 0.02
[2/5] mean_loss : 0.02
[2/5] mean_loss : 0.02
[2/5] mean_loss : 0.02
[3/5] mean_loss : 0.00
[3/5] mean_loss : 0.00
[3/5] mean_loss : 0.02
[3/5] mean_loss : 0.02
[3/5] mean_loss : 0.01
[3/5] mean_loss : 0.01
[3/5] mean_loss : 0.01
[3/5] mean_loss : 0.01
[3/5] mean_loss : 0.01
[3/5] mean_loss : 0.02
[3/5] mean_

## Test 

In [24]:
accuracy = 0

# Split the held-out (sequence, label) pairs into two parallel tuples.
x, y = zip(*test_data)
Y = list(y)

# Stack all test sequences into one batch and put it on the same device as
# the model; the training path does this in pad_to_batch, and without it the
# forward pass fails when the model lives on the GPU.
X = torch.stack(list(x))
if USE_CUDA:
    X = X.cuda()
X = Variable(X)

# is_training=False: the dropout branch of forward() is skipped.
pred_vals = model(X, False)


In [31]:
# Predicted class = index of the largest log-probability in each row.
pred = pred_vals.max(1)[1]


print(type(pred), type(Y))

# One 0/1 flag PER test example (1 = prediction matches the label). Recording
# the misses as 0 keeps `mach_corr` the same length as, and aligned with, the
# test sentences it is zipped against when the results are exported.
mach_corr = [int(p == label) for p, label in zip(pred.data.tolist(), Y)]
acc = sum(mach_corr)
print(100 * acc / len(test_data))


<class 'torch.autograd.variable.Variable'> <class 'list'>
99.320579295548


In [44]:
# Turn every test row back into a space-separated sentence, skipping id 0
# (presumably padding -- TODO confirm).
test_sents = [
    ' '.join(corpus.dictionary.idx2word[idx] for idx in vec if idx != 0)
    for vec in x
]

print(test_sents[:2])

# One record per sentence: its text, its label and the classifier flag.
comb = [
    {"text": sent, "human": label, "machine_corr": correct}
    for sent, correct, label in zip(test_sents, mach_corr, Y)
]
random.shuffle(comb)

# Write a random sample of at most 500 records for the front end.
import json
with open('frontend/src/data/sents.json', 'w') as fp:
    json.dump(comb[:500], fp, indent=4)


['but our thoughts and prayers go by you love , a <unk> that makes all people say they have it .', 'a world fashion trial has begun called " proprietary " and warned it would lead to inject block .']


In [30]:
## save model
# Only the parameters (state_dict) are written, to the file 'discrim_model'
# relative to the current working directory; the class definition is not saved.
torch.save(model.state_dict(), 'discrim_model')

## to load
# Re-create the model with the same constructor arguments, then restore the
# saved parameters:
# the_model = TheModelClass(*args, **kwargs)
# the_model.load_state_dict(torch.load(PATH))

In [28]:
def classify(model, file, corpus):
    """Unfinished stub: reloads both corpora and prints their sizes.

    NOTE(review): none of the three arguments is used -- ``model`` and
    ``file`` are never read and ``corpus`` is replaced by a fresh ``Corpus``
    on the first line. The function reads the notebook-level ``path_real``,
    ``path_fake``, ``batch_size`` and ``seq_length`` and returns ``None``.
    """
    corpus = Corpus()

    # Dictionary size after the real text only.
    real_ids = corpus.get_data(path_real, batch_size)
    print(len(corpus.dictionary))

    # Dictionary size after the fake text has been added as well.
    fake_ids = corpus.get_data(path_fake, batch_size)
    n_vocab = len(corpus.dictionary)
    print(n_vocab)

    # Computed as in the loading cell; the value is not used afterwards.
    n_batches = fake_ids.size(1) // seq_length

    print(real_ids.shape, fake_ids.shape)
    
    

In [32]:
{'human': 1}

{'human': 1}

## Further topics 

* <a href="https://arxiv.org/pdf/1508.06615.pdf">Character-Aware Neural Language Models</a>
* <a href="https://papers.nips.cc/paper/5782-character-level-convolutional-networks-for-text-classification.pdf">Character level CNN for text classification</a>

## Suggested Reading

* https://blog.statsbot.co/text-classifier-algorithms-in-machine-learning-acc115293278
* <a href="https://arxiv.org/pdf/1607.01759">Bag of Tricks for Efficient Text Classification</a>
* <a href="https://arxiv.org/pdf/1708.02657">Which Encoding is the Best for Text Classification in Chinese, English, Japanese and Korean?</a>