In [1]:
import codecs
import random
import math
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
import pandas as pd
from torch.autograd import Variable
import csv
# Global switch consulted by train() and test() to decide whether the model
# and input tensors are moved to the GPU.
use_cuda = torch.cuda.is_available()

In [9]:
def load_data(fpath, label_path):
    """Load a whitespace-tokenised corpus together with its line-aligned labels.

    Line ``i`` of ``label_path`` is the label of line ``i`` of ``fpath``.

    Args:
        fpath: Path to the corpus; one document per line, tokens separated by
            single spaces. Decoded as UTF-8, undecodable bytes are dropped.
        label_path: Path to the label file; one label string per line.

    Returns:
        A tuple ``(data, label_name, label_range)`` where

        * ``data`` is a list of ``[tokens, label_id]`` pairs, one per document,
        * ``label_name`` lists the distinct label strings in order of first
          appearance (``label_id`` indexes into it),
        * ``label_range`` holds the line index at which each label first
          appears, followed by the total number of label lines.

    Raises:
        IndexError: If the corpus has more lines than the label file.
    """
    labels = []
    label_name = []
    label_range = []
    # label string -> id; avoids rescanning label_name for every line.
    label_ids = {}

    with open(label_path) as txt_file:
        for index, line in enumerate(txt_file):
            label_txt = line.strip()
            label_id = label_ids.get(label_txt)
            if label_id is None:
                label_id = len(label_name)
                label_ids[label_txt] = label_id
                label_name.append(label_txt)
                label_range.append(index)
            labels.append(label_id)
    label_range.append(len(labels))

    with codecs.open(fpath, 'r', 'utf-8', errors='ignore') as f:
        lines = f.readlines()

    # Fail with a meaningful message instead of a bare "list index out of range".
    if len(lines) > len(labels):
        raise IndexError(
            'corpus has {:d} lines but only {:d} labels were found'.format(
                len(lines), len(labels)))

    data = [[l.rstrip().split(' '), labels[idx]] for idx, l in enumerate(lines)]
    return data,label_name,label_range
# Read the full corpus, then draw a random split: the first 100 shuffled
# documents become the training set, everything else is held out for testing.
data, label_name, label_range = load_data(
    '/home/shiyang/WeSHClass/math/dataset.txt',
    '/home/shiyang/WeSHClass/math/labels.txt',
)
perm = np.random.permutation(len(data))
train_perm, test_perm = perm[:100], perm[100:]
train_data = [data[doc_idx] for doc_idx in train_perm]
test_data = [data[doc_idx] for doc_idx in test_perm]


In [10]:
# Token count of the longest document in the whole corpus; train() pads or
# truncates every sentence to this length.
max_sentence_len = max(len(tokens) for tokens, _label in data)
print('sentence maxlen', max_sentence_len)

sentence maxlen 717


In [4]:
class Net(nn.Module):
    """Convolutional sentence classifier on top of frozen pretrained embeddings.

    Each filter height ``fh`` gets its own ``Conv2d`` that spans ``fh``
    consecutive tokens and the full embedding width. The ReLU activations of
    every filter bank are max-pooled over the sequence, concatenated, passed
    through dropout and projected to ``num_class`` logits.

    Attributes:
        embedding: Frozen ``nn.Embedding``; row 0 is an all-zero vector used
            for unknown tokens and padding, rows 1.. are the vectors read from
            ``word2vec_path``.
        my_dict: Vocabulary list aligned with the embedding rows (index 0 is a
            placeholder string for the zero row).
    """

    def __init__(self, word2vec_path, out_chs, filter_heights,num_class):
        """Build the network and load the pretrained vectors.

        Args:
            word2vec_path: Path to a space-separated text file; each line is a
                token followed by its vector components.
            out_chs: Number of output channels of every convolution.
            filter_heights: Iterable of filter heights (tokens per window).
            num_class: Number of output classes.
        """
        super(Net, self).__init__()
        # na_filter=False keeps tokens such as "null", "nan" or "NA" as text
        # instead of turning them into float NaN; dtype={0: str} stops purely
        # numeric tokens from being type-inferred.
        word_embedding = pd.read_csv(
            filepath_or_buffer=word2vec_path,
            header=None,
            sep=" ",
            quoting=csv.QUOTE_NONE,
            na_filter=False,
            dtype={0: str},
        )
        words = word_embedding.iloc[:, 0].tolist()
        embedding = word_embedding.iloc[:, 1:].values.astype(np.float32)
        embed_size = embedding.shape[1]

        # Prepend an all-zero row so that index 0 can stand for unknown/padding.
        unknown_word = np.zeros((1, embed_size), dtype=np.float32)
        concat_embedding = torch.from_numpy(np.concatenate([unknown_word, embedding], axis=0))
        # from_pretrained is a classmethod: call it directly rather than on a
        # throwaway, randomly initialised instance. freeze=True disables grads.
        self.embedding = nn.Embedding.from_pretrained(concat_embedding, freeze=True)

        # Placeholder entry for index 0, followed by the pretrained vocabulary.
        self.my_dict = ['中文']
        self.my_dict.extend(words)

        # One Conv2d per filter height; each kernel covers fh tokens x embed_size.
        self.conv = nn.ModuleList([nn.Conv2d(1, out_chs, (fh, embed_size)) for fh in filter_heights])
        self.dropout = nn.Dropout(.5)
        self.fc1 = nn.Linear(out_chs*len(filter_heights), num_class)

    def forward(self, x):
        """Compute class logits for a batch of token-index sequences.

        Args:
            x: Integer tensor of shape (N, seq_len) holding vocabulary indices.
                Sequences shorter than the tallest filter are right-padded
                with index 0 so that every convolution has a valid window.

        Returns:
            Tensor of shape (N, num_class) with unnormalised scores.
        """
        min_len = max((conv.kernel_size[0] for conv in self.conv), default=1)
        if x.size(1) < min_len:
            pad = x.new_zeros(x.size(0), min_len - x.size(1))
            x = torch.cat([x, pad], dim=1)

        x = self.embedding(x)  # (N, seq_len, embed_dim)
        x = x.unsqueeze(1)     # (N, 1, seq_len, embed_dim): add the input-channel dim
        # Conv2d: (N, 1, seq_len, embed_dim) -> (N, Cout, seq_len - fh + 1, 1);
        # squeeze(3) drops the collapsed embedding axis.
        x = [F.relu(conv(x)).squeeze(3) for conv in self.conv]
        # Max over the whole sequence: (N, Cout, L) -> (N, Cout, 1) -> (N, Cout).
        x = [F.max_pool1d(i, i.size(2)).squeeze(2) for i in x]
        x = torch.cat(x, 1)    # (N, Cout * len(filter_heights))
        x = self.dropout(x)
        x = self.fc1(x)
        return x

In [12]:
def train(model, data, batch_size, n_epoch):
    """Train ``model`` with Adam and cross-entropy on fixed-length index vectors.

    Every sentence is mapped to vocabulary indices through the module-level
    ``w2i`` (unknown tokens -> 0) and truncated / right-padded with 0 to the
    module-level ``max_sentence_len``. Samples are reshuffled each epoch and
    consumed in full batches; a trailing partial batch is dropped, except when
    the dataset is smaller than ``batch_size``, in which case the whole dataset
    forms a single batch so that training is not silently skipped.

    Args:
        model: The network to optimise; moved to the GPU when ``use_cuda``.
        data: Sequence of ``(tokens, label_id)`` pairs. Not modified.
        batch_size: Number of samples per optimisation step.
        n_epoch: Number of passes over the data.

    Returns:
        ``(model, losses)`` where ``losses[e]`` is the sum of the per-batch
        losses of epoch ``e``.
    """
    model.train() # enable training behaviour of Dropout / BatchNorm style modules
    if use_cuda:
        model.cuda()

    # Encode once up front: the index vectors do not change between epochs.
    encoded = []
    for sentence, label in data:
        index_vec = [w2i.get(w, 0) for w in sentence[:max_sentence_len]]
        index_vec += [0] * (max_sentence_len - len(index_vec))
        encoded.append((index_vec, label))

    n_samples = len(encoded)
    if n_samples >= batch_size:
        # "+ 1" keeps the final batch when n_samples is a multiple of batch_size.
        batch_starts = range(0, n_samples - batch_size + 1, batch_size)
    else:
        batch_starts = range(0, n_samples, batch_size)

    losses = []
    optimizer = torch.optim.Adam(model.parameters(), lr=0.001, weight_decay=1e-4)
    # The loss has no parameters or buffers, so it needs no device placement.
    criterion = nn.CrossEntropyLoss()
    for epoch in range(n_epoch):
        epoch_loss = 0.0
        # Shuffle the local copy so the caller's list keeps its order.
        random.shuffle(encoded)
        for i in batch_starts:
            batch = encoded[i: i+batch_size]
            sent_var = torch.LongTensor([vec for vec, _ in batch])
            target_var = torch.LongTensor([lab for _, lab in batch])
            if use_cuda:
                sent_var = sent_var.cuda()
                target_var = target_var.cuda()

            optimizer.zero_grad()
            score = model(sent_var)
            loss = criterion(score, target_var)
            loss.backward()
            optimizer.step()
            epoch_loss += loss.item()
        print('epoch: {:d}, loss: {:.3f}'.format(epoch, epoch_loss))
        losses.append(epoch_loss)
    if losses:
        print('Training avg loss: {:.3f}'.format(sum(losses)/len(losses)))

    return model, losses

def test(model, data, n_test):
    """Print top-1, top-3 and top-5 accuracy of ``model`` on ``data[:n_test]``.

    Sentences are encoded exactly as in ``train``: tokens are looked up in the
    module-level ``w2i`` (unknown -> 0) and truncated / right-padded with 0 to
    the module-level ``max_sentence_len``. Using the same fixed length keeps
    evaluation inputs consistent with what the model saw during training and
    guarantees that every convolution filter has a valid window.

    Args:
        model: Trained classifier returning one score per class.
        data: Sequence of ``(tokens, label_id)`` pairs.
        n_test: Maximum number of leading samples to evaluate.
    """
    model.eval()
    samples = data[:n_test]
    # Divide by what was actually evaluated; n_test may exceed len(data).
    n_eval = len(samples)
    correct_1 = 0
    correct_3 = 0
    correct_5 = 0
    # Inference only: skip autograd bookkeeping.
    with torch.no_grad():
        for sentence, label in samples:
            index_vec = [w2i.get(w, 0) for w in sentence[:max_sentence_len]]
            index_vec += [0] * (max_sentence_len - len(index_vec))
            sent_var = torch.LongTensor([index_vec])
            if use_cuda: sent_var = sent_var.cuda()
            score = model(sent_var)
            # Clamp k so models with fewer than five classes still work.
            top_k = min(5, score.size(1))
            pred = torch.topk(score, k=top_k, dim=1)[1].squeeze(0).tolist()
            if label == pred[0]:
                correct_1 += 1
            if label in pred[:3]:
                correct_3 += 1
            if label in pred[:5]:
                correct_5 += 1

    def _acc(correct):
        # Avoid ZeroDivisionError when there is nothing to evaluate.
        return correct / n_eval if n_eval else 0.0

    print('Test top 1 acc: {:.3f} ({:d}/{:d})'.format(_acc(correct_1), correct_1, n_eval))
    print('Test top 3 acc: {:.3f} ({:d}/{:d})'.format(_acc(correct_3), correct_3, n_eval))
    print('Test top 5 acc: {:.3f} ({:d}/{:d})'.format(_acc(correct_5), correct_5, n_eval))
# Hyper-parameters: channels per filter bank, mini-batch size, epochs and the
# filter heights (tokens per convolution window).
out_ch, batch_size, n_epoch = 100, 64, 1000
fil = [1, 2, 3, 4]
num_class = len(label_name)

# Build the classifier on top of the pretrained vectors.
word2vec_path = '/home/shiyang/cnn-for-sentence-classification/dataset/glove.6B.50d.txt'
model = Net(word2vec_path, out_ch, fil, num_class)

# Vocabulary lookups in both directions, aligned with the embedding rows.
i2w = dict(enumerate(model.my_dict))
w2i = {word: position for position, word in enumerate(model.my_dict)}

model, losses = train(model, train_data, batch_size, n_epoch)

epoch: 0, loss: 3.422
epoch: 1, loss: 3.285
epoch: 2, loss: 2.887
epoch: 3, loss: 2.948
epoch: 4, loss: 2.779
epoch: 5, loss: 2.626
epoch: 6, loss: 2.657
epoch: 7, loss: 2.602
epoch: 8, loss: 2.616
epoch: 9, loss: 2.768
epoch: 10, loss: 2.576
epoch: 11, loss: 2.449
epoch: 12, loss: 2.562
epoch: 13, loss: 2.534
epoch: 14, loss: 2.471
epoch: 15, loss: 2.386
epoch: 16, loss: 2.230
epoch: 17, loss: 2.263
epoch: 18, loss: 2.275
epoch: 19, loss: 2.227
epoch: 20, loss: 2.265
epoch: 21, loss: 2.182
epoch: 22, loss: 2.279
epoch: 23, loss: 2.242
epoch: 24, loss: 2.140
epoch: 25, loss: 2.055
epoch: 26, loss: 2.127
epoch: 27, loss: 2.090
epoch: 28, loss: 2.057
epoch: 29, loss: 2.016
epoch: 30, loss: 1.983
epoch: 31, loss: 1.902
epoch: 32, loss: 1.943
epoch: 33, loss: 1.907
epoch: 34, loss: 1.868
epoch: 35, loss: 1.868
epoch: 36, loss: 1.982
epoch: 37, loss: 1.988
epoch: 38, loss: 1.817
epoch: 39, loss: 1.814
epoch: 40, loss: 1.834
epoch: 41, loss: 1.696
epoch: 42, loss: 1.735
epoch: 43, loss: 1.75

epoch: 348, loss: 0.042
epoch: 349, loss: 0.042
epoch: 350, loss: 0.051
epoch: 351, loss: 0.037
epoch: 352, loss: 0.036
epoch: 353, loss: 0.041
epoch: 354, loss: 0.026
epoch: 355, loss: 0.031
epoch: 356, loss: 0.033
epoch: 357, loss: 0.030
epoch: 358, loss: 0.038
epoch: 359, loss: 0.032
epoch: 360, loss: 0.034
epoch: 361, loss: 0.031
epoch: 362, loss: 0.035
epoch: 363, loss: 0.041
epoch: 364, loss: 0.040
epoch: 365, loss: 0.041
epoch: 366, loss: 0.028
epoch: 367, loss: 0.044
epoch: 368, loss: 0.042
epoch: 369, loss: 0.039
epoch: 370, loss: 0.027
epoch: 371, loss: 0.020
epoch: 372, loss: 0.030
epoch: 373, loss: 0.047
epoch: 374, loss: 0.025
epoch: 375, loss: 0.029
epoch: 376, loss: 0.032
epoch: 377, loss: 0.028
epoch: 378, loss: 0.024
epoch: 379, loss: 0.040
epoch: 380, loss: 0.032
epoch: 381, loss: 0.019
epoch: 382, loss: 0.032
epoch: 383, loss: 0.023
epoch: 384, loss: 0.027
epoch: 385, loss: 0.032
epoch: 386, loss: 0.027
epoch: 387, loss: 0.049
epoch: 388, loss: 0.031
epoch: 389, loss

epoch: 699, loss: 0.008
epoch: 700, loss: 0.008
epoch: 701, loss: 0.012
epoch: 702, loss: 0.007
epoch: 703, loss: 0.009
epoch: 704, loss: 0.009
epoch: 705, loss: 0.007
epoch: 706, loss: 0.010
epoch: 707, loss: 0.008
epoch: 708, loss: 0.009
epoch: 709, loss: 0.010
epoch: 710, loss: 0.008
epoch: 711, loss: 0.008
epoch: 712, loss: 0.010
epoch: 713, loss: 0.016
epoch: 714, loss: 0.011
epoch: 715, loss: 0.006
epoch: 716, loss: 0.009
epoch: 717, loss: 0.013
epoch: 718, loss: 0.008
epoch: 719, loss: 0.012
epoch: 720, loss: 0.008
epoch: 721, loss: 0.009
epoch: 722, loss: 0.009
epoch: 723, loss: 0.007
epoch: 724, loss: 0.008
epoch: 725, loss: 0.008
epoch: 726, loss: 0.006
epoch: 727, loss: 0.006
epoch: 728, loss: 0.009
epoch: 729, loss: 0.009
epoch: 730, loss: 0.012
epoch: 731, loss: 0.006
epoch: 732, loss: 0.012
epoch: 733, loss: 0.013
epoch: 734, loss: 0.014
epoch: 735, loss: 0.010
epoch: 736, loss: 0.009
epoch: 737, loss: 0.011
epoch: 738, loss: 0.006
epoch: 739, loss: 0.008
epoch: 740, loss

In [13]:
# Evaluate the trained model on every document outside the training split;
# test() prints top-1 / top-3 / top-5 accuracy.
test(model, test_data, len(test_data))

Test top 1 acc: 0.307 (20157/65556)
Test top 3 acc: 0.546 (35801/65556)
Test top 5 acc: 0.672 (44070/65556)
