In [1]:
import torchtext as tt
import collections
import pandas as pd
import torch as T
import numpy as np

  from .autonotebook import tqdm as notebook_tqdm


In [6]:
def make_vocab(list_sentences):
  """Build a torchtext Vocab that maps tokens to integer IDs.

  Each sentence is stripped, tokenized with the global tokenizer `g_toker`,
  and its tokens are counted. The vocabulary is built from those counts
  (min_freq=1, so every seen token is kept) with '<unk>' and '<pad>'
  registered as special tokens.

  Args:
    list_sentences: iterable of str sentences.

  Returns:
    The Vocab created by tt.vocab.vocab, with its default index set to the
    '<unk>' entry so that looking up an unseen token returns that ID
    instead of raising.
  """
  counter_obj = collections.Counter()
  for sentence in list_sentences:
    counter_obj.update(g_toker(sentence.strip()))  # g_toker is a global
  result = tt.vocab.vocab(counter_obj, min_freq=1, specials=('<unk>', '<pad>'))
  # Without a default index, a lookup of an out-of-vocabulary token raises,
  # which breaks prediction on any text containing a word not seen here.
  result.set_default_index(result['<unk>'])
  return result

def make_data_list(DF):
  """Flatten a DataFrame into a list of (label, review) tuples.

  Row order is preserved: element i pairs DF['Sentiment'] with DF['Phrase']
  at position i. The list is meant to be handed to a DataLoader whose
  collate_fn unpacks these tuples.
  """
  labels = DF['Sentiment']
  reviews = DF['Phrase']
  # Positional (.iloc) access so a non-default index does not matter.
  return [(labels.iloc[pos], reviews.iloc[pos]) for pos in range(len(DF))]

def collate_data(batch):
  """Collate (label, review) pairs into the three tensors the net consumes.

  Relies on the globals g_toker (tokenizer), g_vocab (token -> ID
  vocabulary) and device.

  Args:
    batch: iterable of (label, review) pairs; label must be convertible
      with int(), review is a str.

  Returns:
    (labels, reviews, offsets):
      labels  - int64 tensor with one entry per review.
      reviews - int64 1-D tensor holding the token IDs of all reviews,
                concatenated in batch order.
      offsets - 1-D tensor with the start position of each review inside
                `reviews` (first entry is 0).
  """
  label_lst, review_lst, offset_lst = [], [], [0]
  for (_lbl, _rvw) in batch:
    label_lst.append(int(_lbl))  # string to int

    # Look tokens up through the Vocab object only. The previous extra pass
    # over g_vocab.get_stoi() rebuilt the whole token dict on every batch,
    # had its result overwritten immediately, and raised KeyError on any
    # unseen token before the Vocab lookup could run.
    rvw_idxs = [g_vocab[tok] for tok in g_toker(_rvw)]
    rvw_idxs = T.tensor(rvw_idxs, dtype=T.int64)  # to tensor
    review_lst.append(rvw_idxs)
    offset_lst.append(len(rvw_idxs))

  label_lst = T.tensor(label_lst, dtype=T.int64).to(device)
  # offset_lst is [0, len_0, len_1, ...]; dropping the last length and taking
  # the running sum yields the start index of every review.
  offset_lst = T.tensor(offset_lst[:-1]).cumsum(dim=0).to(device)
  review_lst = T.cat(review_lst).to(device)  # list of tensors to 1 tensor

  return (label_lst, review_lst, offset_lst)

def train(net, ldr, bs, me, le, lr):
  """Fit `net` with plain SGD and cross-entropy loss.

  Args:
    net: module called as net(reviews, offsets); its weights are updated
      in place and it is left in training mode.
    ldr: iterable yielding (labels, reviews, offsets) batches.
    bs: batch size (accepted but not used by this function).
    me: number of epochs to run.
    le: logging interval (accepted but not used by this function).
    lr: learning rate handed to the optimizer.

  Prints the loss summed over all batches once per epoch.
  """
  net.train()
  criterion = T.nn.CrossEntropyLoss()  # takes raw scores, not probabilities
  optimizer = T.optim.SGD(net.parameters(), lr=lr)
  print("\nStarting training")
  for epoch in range(me):
    running = 0.0  # sum of batch losses, shown for progress only
    for labels, reviews, offsets in ldr:
      optimizer.zero_grad()
      batch_loss = criterion(net(reviews, offsets), labels)
      batch_loss.backward()
      optimizer.step()
      running += batch_loss.item()
    print("epoch = %4d   loss = %0.4f" % (epoch, running))
  print("Done ")

def accuracy(net, meta_lst, bat_size=1):
  """Return the fraction of items in `meta_lst` that `net` classifies correctly.

  Args:
    net: module called as net(reviews, offsets) that returns one row of
      class scores per review; it is switched to eval mode.
    meta_lst: sequence of (label, review) pairs, collated by collate_data.
    bat_size: number of items scored per forward pass. Defaults to 1, which
      matches the original one-item-at-a-time evaluation.

  Returns:
    float: correct / total, or 0.0 when `meta_lst` is empty (previously an
    empty list raised ZeroDivisionError).
  """
  net.eval()
  ldr = T.utils.data.DataLoader(meta_lst,
    batch_size=bat_size, shuffle=False, collate_fn=collate_data)
  num_correct = 0
  num_total = 0
  with T.no_grad():
    for (labels, reviews, offsets) in ldr:
      oupt = net(reviews, offsets)  # raw class scores, one row per review
      # softmax is monotonic, so the argmax of the raw scores already is
      # the predicted class index; no need to normalise first.
      predicted = T.argmax(oupt, dim=1)
      num_correct += int((predicted == labels).sum().item())
      num_total += labels.numel()

  if num_total == 0:
    return 0.0
  return (num_correct * 1.0) / num_total

In [3]:
class NeuralNet(T.nn.Module):
  """EmbeddingBag -> Linear -> tanh -> Linear text classifier.

  Args:
    vocab_size: number of rows in the embedding table. When None (default)
      it is taken from the global vocabulary, len(g_vocab).
    embed_dim: width of each token embedding (default 50).
    num_class: number of output classes (default 5).
    hidden_dim: width of the hidden linear layer (default 20).
    init_lim: all weights are drawn uniformly from [-init_lim, init_lim);
      biases start at zero (default 0.05).

  Calling NeuralNet() with no arguments builds the same network as before.
  """

  def __init__(self, vocab_size=None, embed_dim=50, num_class=5,
               hidden_dim=20, init_lim=0.05):
    super().__init__()
    # Only touch the global when the caller did not supply a size, so the
    # class can be built (and tested) without a global vocabulary.
    self.vocab_size = len(g_vocab) if vocab_size is None else vocab_size
    self.embed_dim = embed_dim
    self.num_class = num_class

    self.embed = T.nn.EmbeddingBag(self.vocab_size, self.embed_dim)
    self.fc1 = T.nn.Linear(self.embed_dim, hidden_dim)
    self.fc2 = T.nn.Linear(hidden_dim, self.num_class)

    # In-place initialisation under no_grad instead of mutating .data.
    with T.no_grad():
      self.embed.weight.uniform_(-init_lim, init_lim)
      self.fc1.weight.uniform_(-init_lim, init_lim)
      self.fc1.bias.zero_()
      self.fc2.weight.uniform_(-init_lim, init_lim)
      self.fc2.bias.zero_()

  def forward(self, reviews, offsets):
    """Return raw class scores, one row per review.

    Args:
      reviews: 1-D tensor of token IDs for all reviews, concatenated.
      offsets: 1-D tensor with the start index of each review in `reviews`.
    """
    z = self.embed(reviews, offsets)  # one pooled vector per review
    z = T.tanh(self.fc1(z))  # tanh activation
    z = self.fc2(z)  # no activation: CrossEntropyLoss expects raw scores
    return z

In [18]:
# Fix the torch RNG so weight initialisation and DataLoader shuffling are the
# same on every Restart & Run All; without it each run trains differently.
T.manual_seed(1)

device = T.device("cpu")

train_set = pd.read_csv('train.tsv', sep='\t')
# g_ prefix marks globals that make_vocab / collate_data / NeuralNet read
g_toker = tt.data.utils.get_tokenizer("basic_english")
g_vocab = make_vocab(train_set['Phrase'])

bat_size = 3
data_lst = make_data_list(train_set)
train_ldr = T.utils.data.DataLoader(data_lst,
  batch_size=bat_size, shuffle=True,
  collate_fn=collate_data)
net = NeuralNet().to(device)
max_epochs = 5
log_interval = 30
lrn_rate = 0.05
train(net, train_ldr, bat_size, max_epochs,
    log_interval, lrn_rate)
acc_train = accuracy(net, data_lst)
# Same output as before: the old in-string backslash continuation pulled the
# next line's indentation into the message, hence the three spaces after '='.
print("\nAccuracy of model on training data =   %0.4f " % acc_train)


Starting training
epoch =    0   loss = 64002.6739
epoch =    1   loss = 54891.0225
epoch =    2   loss = 48743.5561
epoch =    3   loss = 46076.4087
epoch =    4   loss = 44111.1318
Done 


In [16]:
print("New movie review: Overall, I liked the film.")
# "-1" is a placeholder label: collate_data requires one, and it is
# discarded when the batch is unpacked below.
review_lst = [("-1", "Overall, I liked the film.")]
ldr = T.utils.data.DataLoader(review_lst,
  batch_size=1, shuffle=False, collate_fn=collate_data)
net.eval()
# Use the built-in next(); DataLoader iterators are not guaranteed to expose
# a Python-2 style .next() method.
(_, review, offset) = next(iter(ldr))
with T.no_grad():
  oupt = net(review, offset)  # get raw prediction values
pp = T.softmax(oupt, dim=1)   # as pseudo-probabilities
# The net emits one probability per class, not a [neg, pos] pair, so label
# the output by class index.
print("Sentiment prediction probabilities [class 0 .. class %d]: "
  % (pp.shape[1] - 1))
print(pp)

New movie review: Overall, I liked the film.
Sentiment prediction probabilities [neg, pos]: 
tensor([[0.0498, 0.2193, 0.4741, 0.2048, 0.0521]])
