# Continuous Bag of Words (CBOW)


We want to implement the model shown below:



![Bag Of Word](./img/cbow.png)



Here, we use the model for a text classification task.

In [1]:
from collections import defaultdict
import time
import random
import torch
import numpy as np
import torch.nn as nn
from torch.nn import Parameter
from torch.autograd import Variable
import torch.optim as optim
import torch.nn.functional as F

In [2]:
# Central place for every hyper-parameter and data path used below.
class Config(object):
    """Container for the hyper-parameters and dataset locations."""

    def __init__(self):
        # Dataset locations.
        self.train_path = "../data/classes/train.txt"
        self.test_path = "../data/classes/test.txt"

        # Optimisation settings; only 2 epochs are run, as an example.
        self.lr = 1e-3
        self.epoch_num = 2

        # Dimensionality of the word embeddings.
        self.embedding_size = 64


config = Config()

In [3]:
# Functions to read in the corpus
# A missing key is assigned the current dict size, so every new word / tag
# automatically receives the next free integer id.
w2i = defaultdict(lambda: len(w2i))
t2i = defaultdict(lambda: len(t2i))
UNK = w2i["<unk>"]  # reserve id 0 for the unknown-word token


def read_dataset(filename):
    """Yield ``(word_ids, tag_id)`` pairs parsed from a corpus file.

    Each non-blank line must look like ``"<tag> ||| <word> <word> ..."``.
    The line is lower-cased, the words are mapped to integer ids through the
    module-level ``w2i`` and the tag through ``t2i``.

    Args:
        filename: path of the text file to read.

    Yields:
        A tuple ``([id1, id2, id3, ...], tag_id)`` where the list represents
        the sentence and ``tag_id`` its label.

    Raises:
        ValueError: if a non-blank line does not contain exactly one
            ``" ||| "`` separator.
    """
    # Explicit encoding: the platform default is not UTF-8 everywhere.
    with open(filename, "r", encoding="utf-8") as f:
        for line in f:
            line = line.lower().strip()
            if not line:
                # Blank lines carry no example; skip them instead of crashing.
                continue
            tag, words = line.split(" ||| ")
            yield ([w2i[x] for x in words.split(" ")], t2i[tag])

In [4]:
# Read in the data (paths come from the shared Config instead of being
# repeated here).
train = list(read_dataset(config.train_path))

# Freeze the vocabulary: from now on an unseen word maps to UNK instead of
# receiving a fresh id.
w2i = defaultdict(lambda: UNK, w2i)

# Measure the vocabulary before reading the dev set: a defaultdict lookup
# inserts the missing key, so every unseen dev word would otherwise inflate
# len(w2i) (and the embedding table) although all of them share the UNK id.
nwords = len(w2i)

dev = list(read_dataset(config.test_path))
# Tags are counted after both splits because t2i is not frozen and may still
# grow while the dev set is read.
ntags = len(t2i)

In [5]:
# build the model
class CBOW(nn.Module):
    """Continuous bag-of-words sentence classifier.

    The word embeddings of a sentence are summed, projected to one score per
    tag by a bias-free linear layer, shifted by a separate learned bias and
    normalised with log-softmax.

    Args:
        vocab_size: number of rows of the embedding table.
        embedding_size: dimensionality of each word embedding.
        tag_size: number of output classes.
    """

    def __init__(self, vocab_size, embedding_size, tag_size):
        super(CBOW, self).__init__()
        self.embed = nn.Embedding(vocab_size, embedding_size)
        self.linear = nn.Linear(embedding_size, tag_size, bias=False)
        # The bias is kept as its own parameter, initialised uniformly in
        # [-1, 1).
        self.bow_bias = Parameter(torch.empty(tag_size).uniform_(-1, 1))

    def forward(self, x):
        """Return per-tag log-probabilities.

        Args:
            x: LongTensor of word ids, shape (batch, words).

        Returns:
            Tensor of shape (batch, tag_size) holding log-probabilities.
        """
        embeds = self.embed(x)  # b*w -> b*w*e
        embed_sum = torch.sum(embeds, 1)  # b*w*e -> b*e
        word_score = self.linear(embed_sum)  # b*e -> b*t
        # Out-of-place add: avoids mutating a tensor that autograd tracks.
        scores = word_score + self.bow_bias
        # Normalise over the tag dimension explicitly; relying on the
        # implicit dim is deprecated.
        return F.log_softmax(scores, dim=1)

In [6]:
# Instantiate the classifier from the corpus statistics and the config.
model = CBOW(
    vocab_size=nwords,
    embedding_size=config.embedding_size,
    tag_size=ntags,
)

# Loss and optimiser. The model already ends in a log-softmax, so the
# negative log-likelihood loss is applied directly: loss(model(input), target).
loss_fn = nn.NLLLoss()
optimizer = optim.Adam(params=model.parameters(), lr=config.lr)

In [7]:
# Start training
for epoch in range(config.epoch_num):
    random.shuffle(train)
    train_loss = 0.0
    start = time.time()
    for words, tag in train:
        # One sentence per step, fed as a batch of size 1: shape (1, n_words).
        words = torch.LongTensor(words).view(1, -1)
        tag = torch.LongTensor([tag])
        model.zero_grad()
        my_loss = loss_fn(model(words), tag)
        # The loss is a 0-dim tensor; .item() extracts the Python float
        # (indexing it with [0] raises on current PyTorch versions).
        train_loss += my_loss.item()
        my_loss.backward()
        optimizer.step()
    print("iter %r: train loss/sent=%.4f, time=%.2fs" % (epoch, train_loss/len(train), time.time()-start))

    # Perform testing
    test_correct = 0.0
    # No gradients are needed for evaluation; skipping graph construction
    # saves time and memory.
    with torch.no_grad():
        for words, tag in dev:
            words = torch.LongTensor(words).view(1, -1)
            # The highest log-probability in the single output row is the
            # predicted tag id.
            predict = model(words).argmax(dim=1).item()
            if predict == tag:
                test_correct += 1
    print("iter %r: test acc=%.4f" % (epoch, test_correct/len(dev)))

iter 0: train loss/sent=1.7765, time=360.31s
iter 0: test acc=0.2964
iter 1: train loss/sent=1.4725, time=543.00s
iter 1: test acc=0.3321
