# Bag of Words

We want to implement the model shown below:



![Bag Of Word](./img/bow.png)



Here, we are doing a text classification task.

In [1]:
from collections import defaultdict
import time
import random
import torch
import numpy as np
import torch.nn as nn
from torch.nn import Parameter
from torch.autograd import Variable
import torch.optim as optim
import torch.nn.functional as F

In [2]:
# Central place for every hyper-parameter and data path used by the notebook.
class Config(object):
    """Container for the training hyper-parameters and dataset locations.

    Attributes are set in the constructor:
      lr         -- learning rate handed to the optimizer
      epoch_num  -- number of passes over the training set
      train_path -- path of the training corpus
      test_path  -- path of the held-out corpus
    """

    def __init__(self):
        # Optimisation settings.
        self.lr, self.epoch_num = 1e-3, 10
        # Corpus locations, relative to the notebook directory.
        self.train_path = "../data/classes/train.txt"
        self.test_path = "../data/classes/test.txt"


# Single shared configuration instance used by the cells below.
config = Config()

In [3]:
# Functions to read in the corpus.
#
# Vocabulary maps shared by every dataset read. Looking up an unseen key
# assigns it the next free integer id (the current size of the dict).
w2i = defaultdict(lambda: len(w2i))
t2i = defaultdict(lambda: len(t2i))
# Touch "<unk>" first so the unknown-word token gets the first word id.
UNK = w2i["<unk>"]


def read_dataset(filename):
    """Yield ``(word_ids, tag_id)`` pairs parsed from a labelled corpus file.

    Every non-blank line must have the form ``tag ||| word word ...``. The
    line is lower-cased and stripped, the sentence is split on single spaces,
    and words and tag are mapped to integer ids through the module-level
    ``w2i`` and ``t2i`` dictionaries. Those names are looked up as globals on
    each call, so rebinding ``w2i`` changes the mapping used by later reads.

    Args:
        filename: path of a UTF-8 text file with one example per line.

    Yields:
        A tuple ``([id1, id2, id3, ...], tag_id)``: the list holds the word
        ids of the sentence and ``tag_id`` is the id of its label.

    Raises:
        ValueError: if a non-blank line does not contain the ``" ||| "``
            separator.
    """
    # Explicit encoding keeps the vocabulary identical across platforms.
    with open(filename, "r", encoding="utf-8") as f:
        for line in f:
            line = line.lower().strip()
            if not line:
                # Tolerate blank lines (e.g. a trailing newline at end of file).
                continue
            # Split only on the first separator so a sentence that itself
            # contains " ||| " does not break the unpacking.
            tag, words = line.split(" ||| ", 1)
            yield ([w2i[x] for x in words.split(" ")], t2i[tag])

In [4]:
# Read in the data.
# The order of these statements matters: the training set is read while w2i
# still assigns fresh ids, then w2i is frozen before the second file is read.
train = list(read_dataset(config.train_path))
# Rebuild w2i with the entries collected so far and a default of UNK, so words
# that were not seen in training map to the unknown-word id.
w2i = defaultdict(lambda: UNK, w2i)
# Note: "dev" is loaded from config.test_path; it is the set evaluated after
# each epoch.
dev = list(read_dataset(config.test_path))
# NOTE(review): a defaultdict stores the default value on lookup, so words
# first seen while reading dev are added as keys (all mapped to UNK) and are
# therefore included in this count -- confirm this is intended.
nwords = len(w2i)
ntags = len(t2i)

In [5]:
# build the model
class BOW(nn.Module):
    """Bag-of-words classifier.

    Every word id owns a vector with one score per tag. The scores of all
    words in a sentence are summed, a learned per-tag bias is added, and the
    result is normalised with log-softmax.

    Args:
        vocab_size: number of rows in the embedding table (word ids must be
            smaller than this).
        tag_size: number of output classes.
    """

    def __init__(self, vocab_size, tag_size):
        super(BOW, self).__init__()
        self.embed = nn.Embedding(vocab_size, tag_size)
        # Start the bias at zero. ``torch.Tensor(tag_size)`` would only
        # allocate memory without initialising it, so the bias could begin
        # with arbitrary values (including NaN/inf).
        self.bow_bias = Parameter(torch.zeros(tag_size))

    def forward(self, x):
        """Return per-tag log-probabilities for a batch of sentences.

        Args:
            x: LongTensor of word ids with shape (batch, words).

        Returns:
            Tensor of shape (batch, tag_size) holding log-probabilities.
        """
        embeds = self.embed(x)  # b*w*t   in this case, b = batch = 1
        word_score = torch.sum(embeds, 1)  # b*w*t -> b*t
        # Out-of-place add: avoids mutating an autograd intermediate in place.
        scores = word_score + self.bow_bias
        # Normalise over the tag dimension explicitly; relying on the implicit
        # dimension is deprecated.
        return F.log_softmax(scores, dim=1)

In [6]:
# Instantiate the classifier from the vocabulary size and the number of tags.
bow = BOW(nwords, ntags)

# Loss and optimizer. The model's last step is a log-softmax, so the loss is
# the negative log-likelihood: loss_fn(bow(input), target).
loss_fn = nn.NLLLoss()
optimizer = optim.Adam(bow.parameters(), lr=config.lr)

In [7]:
# Start training: one pass over the shuffled training set per epoch, followed
# by an accuracy measurement on the dev set. Sentences are processed one at a
# time (batch size 1).
for epoch in range(config.epoch_num):
    random.shuffle(train)
    train_loss = 0.0
    start = time.time()
    for words, tag in train:
        # Shape (1, n_words): a batch containing a single sentence.
        words = torch.LongTensor(words).view(1, -1)
        tag = torch.LongTensor([tag])
        bow.zero_grad()
        my_loss = loss_fn(bow(words), tag)
        # .item() extracts the Python float from the 0-dim loss tensor;
        # indexing it with [0] raises IndexError on current PyTorch.
        train_loss += my_loss.item()
        my_loss.backward()
        optimizer.step()
    print("iter %r: train loss/sent=%.4f, time=%.2fs" % (epoch, train_loss/len(train), time.time()-start))

    # Perform testing
    test_correct = 0.0
    # No gradients are needed for evaluation, so skip building the graph.
    with torch.no_grad():
        for words, tag in dev:
            words = torch.LongTensor(words).view(1, -1)
            # Index of the highest-scoring tag for the single sentence.
            predict = bow(words).argmax(dim=1).item()
            if predict == tag:
                test_correct += 1
    print("iter %r: test acc=%.4f" % (epoch, test_correct/len(dev)))

iter 0: train loss/sent=4.4428, time=31.09s
iter 0: test acc=0.2222
iter 1: train loss/sent=3.3411, time=47.01s
iter 1: test acc=0.2480
iter 2: train loss/sent=2.6772, time=45.05s
iter 2: test acc=0.2656
iter 3: train loss/sent=2.2217, time=44.12s
iter 3: test acc=0.2805
iter 4: train loss/sent=1.8787, time=43.52s
iter 4: test acc=0.2955
iter 5: train loss/sent=1.6076, time=43.66s
iter 5: test acc=0.2982
iter 6: train loss/sent=1.3876, time=43.48s
iter 6: test acc=0.2959
iter 7: train loss/sent=1.2050, time=43.60s
iter 7: test acc=0.3109
iter 8: train loss/sent=1.0544, time=43.54s
iter 8: test acc=0.3181
iter 9: train loss/sent=0.9263, time=43.58s
iter 9: test acc=0.3136
