# Kaleb playing around with embeddings...
`keitorch.py` is a modified version of `torchnlp` —
*which, by the way, is not cool: it's named the same as an actual torch library. It took me forever to figure that out.*

In [1]:
import torch
import torchtext
import numpy as np
from torchnlp import *
from keitorch import *

  from pandas.core.computation.check import NUMEXPR_INSTALLED


In [2]:
# Directory holding the Yelp Review Full CSV files. Named `data_dir` rather than
# `dir` so the built-in dir() is not shadowed for the rest of the kernel session.
data_dir = 'data/datasets/YelpReviewFull'
# load_dataset_from_csvs is not defined in this notebook; presumably it is
# provided by one of the star imports above (torchnlp / keitorch).
train_dataset, test_dataset, classes, vocab = load_dataset_from_csvs(dir=data_dir)
# The vocabulary size is the number of rows needed in the embedding tables below.
vocab_size = len(vocab)
print("Vocab size = ",vocab_size)

Loading dataset...
Building vocab...
Vocab size =  519818


In [3]:
# Inspect the first training example; it is displayed as a [label, review text] pair.
train_dataset[0]

[2,
 "Unfortunately, the frustration of being Dr. Goldberg's patient is a repeat of the experience I've had with so many other doctors in NYC -- good doctor, terrible staff.  It seems that his staff simply never answers the phone.  It usually takes 2 hours of repeated calling to get an answer.  Who has time for that or wants to deal with it?  I have run into this problem with many other doctors and I just don't get it.  You have office workers, you have patients with medical needs, why isn't anyone answering the phone?  It's incomprehensible and not work the aggravation.  It's with regret that I feel that I have to give Dr. Goldberg 2 stars."]

## Using both Regular Embed Classifier and Embedding Bag Classifier

In [4]:
class EmbedClassifier(torch.nn.Module):
    """Classifier that mean-pools token embeddings and applies one linear layer.

    Args:
        vocab_size: number of rows in the embedding table.
        embed_dim: length of each embedding vector.
        num_class: number of output logits.
    """

    def __init__(self, vocab_size, embed_dim, num_class):
        super().__init__()
        # Creation order (embedding, then fc) is kept so that seeded
        # initialisation produces the same weights as before.
        self.embedding = torch.nn.Embedding(vocab_size, embed_dim)
        self.fc = torch.nn.Linear(embed_dim, num_class)

    def forward(self, x):
        """Return class logits for a batch of token-index sequences.

        `x` is averaged over dimension 1 after the embedding lookup, so every
        position along that dimension (including any padding positions the
        caller added) contributes equally to the pooled vector.
        """
        token_vectors = self.embedding(x)
        pooled = token_vectors.mean(dim=1)
        logits = self.fc(pooled)
        return logits

class EmbedBagClassifier(torch.nn.Module):
    """Classifier built from an EmbeddingBag followed by one linear layer.

    Args:
        vocab_size: number of rows in the embedding table.
        embed_dim: length of each embedding vector.
        num_class: number of output logits.
    """

    def __init__(self, vocab_size, embed_dim, num_class):
        super().__init__()
        # EmbeddingBag is constructed with its default reduction mode.
        self.embedding = torch.nn.EmbeddingBag(vocab_size, embed_dim)
        self.fc = torch.nn.Linear(embed_dim, num_class)

    def forward(self, text, off):
        """Return class logits, one row per bag.

        Args:
            text: flat tensor of token indices for all sequences in the batch.
            off: start position of each sequence inside `text`.
        """
        bag_vectors = self.embedding(text, off)
        logits = self.fc(bag_vectors)
        return logits

In [5]:
# Original file does not work as it doesn't use the correct vocab variable

def padify(b,voc=vocab,tokenizer=tokenizer):
    """Collate a minibatch into zero-based labels and a right-padded index matrix.

    Intended for use as a DataLoader `collate_fn`.

    Args:
        b: list of samples; element 0 of each sample is the label and
            element 1 is the review text.
        voc: vocabulary forwarded to `encode`.
        tokenizer: tokenizer forwarded to `encode`.

    Returns:
        A `(labels, features)` tuple: `labels` is a LongTensor holding
        `label - 1` for each sample, and `features` is a long tensor of shape
        `(len(b), max_len)` in which shorter sequences are padded on the right
        with 0.
    """
    # Force int64: torch.tensor([]) would otherwise be float32, so a single
    # review that encodes to zero tokens would give the batch mixed dtypes and
    # make it unusable as embedding indices.
    encoded = [
        torch.tensor(encode(sample[1], voc=voc, tokenizer=tokenizer), dtype=torch.long)
        for sample in b
    ]
    # Longest sequence in this minibatch decides the padded width.
    max_len = max(len(seq) for seq in encoded)
    # Labels are shifted down by one so they start at 0.
    labels = torch.LongTensor([sample[0] - 1 for sample in b])
    # NOTE(review): 0 is used as the padding value; whether index 0 is reserved
    # for padding/unknown in the vocabulary is not visible here - confirm.
    features = torch.stack([
        torch.nn.functional.pad(seq, (0, max_len - len(seq)), mode='constant', value=0)
        for seq in encoded
    ])
    return labels, features

def offsetify(b, voc=vocab):
    """Collate a minibatch into labels, one flat index tensor, and bag offsets.

    Intended for use as a DataLoader `collate_fn` feeding an EmbeddingBag model.

    Args:
        b: list of samples; element 0 of each sample is the label and
            element 1 is the review text.
        voc: vocabulary forwarded to `encode`.

    Returns:
        A `(labels, text, offsets)` tuple: `labels` is a LongTensor holding
        `label - 1` for each sample, `text` is the concatenation of all encoded
        sequences, and `offsets[i]` is the position in `text` where sample `i`
        starts.
    """
    # NOTE(review): unlike padify, no tokenizer is passed to encode here, so
    # encode's own default is used - confirm both collate functions tokenize alike.
    # Force int64: torch.tensor([]) would otherwise be float32, and a review
    # that encodes to zero tokens would contaminate the concatenated indices.
    sequences = [torch.tensor(encode(sample[1], voc=voc), dtype=torch.long) for sample in b]
    # Offsets are the running sum of the preceding sequence lengths: the first
    # sequence starts at 0 and the last length is not needed.
    lengths = [0] + [len(seq) for seq in sequences]
    offsets = torch.tensor(lengths[:-1]).cumsum(dim=0)
    labels = torch.LongTensor([sample[0] - 1 for sample in b])
    return labels, torch.cat(sequences), offsets


In [6]:
# For regular
def train_epoch(net,dataloader,vocab, lr=0.01,optimizer=None,loss_fn = torch.nn.CrossEntropyLoss(),epoch_size=None, report_freq=200):
    """Train `net` on batches of (labels, features) and report running accuracy.

    Args:
        net: model called as `net(features)`.
        dataloader: iterable yielding `(labels, features)` tensor pairs.
        vocab: unused; kept so existing call sites keep working.
        lr: learning rate for the default Adam optimizer.
        optimizer: optimizer to use; when falsy, Adam over `net.parameters()`
            is created with `lr`.
        loss_fn: loss module called as `loss_fn(out, labels)`.
        epoch_size: if set, stop once more than this many samples were seen.
        report_freq: print the running accuracy every `report_freq` batches.

    Returns:
        `(total_loss / count, correct / count)`, where `total_loss` is the sum
        of the per-batch loss values and `count` is the number of samples seen.

    Raises:
        ValueError: if the dataloader yields no samples.
    """
    optimizer = optimizer or torch.optim.Adam(net.parameters(),lr=lr)
    # `device` is a module-level name defined outside this function.
    loss_fn = loss_fn.to(device)
    net.train()
    total_loss, correct, count = 0.0, 0, 0
    for step, (labels, features) in enumerate(dataloader, start=1):
        optimizer.zero_grad()
        features, labels = features.to(device), labels.to(device)
        out = net(features)
        loss = loss_fn(out, labels)
        loss.backward()
        optimizer.step()
        # Accumulate a plain float: adding the loss tensor itself would keep
        # autograd history alive and grow memory with every batch.
        total_loss += loss.item()
        _, predicted = torch.max(out, 1)
        correct += (predicted == labels).sum().item()
        count += len(labels)
        if step % report_freq == 0:
            print(f"{count}: acc={correct/count}")
        if epoch_size and count > epoch_size:
            break
    if count == 0:
        raise ValueError("dataloader yielded no samples; nothing to train on")
    # The loss figure divides a sum of per-batch losses by the sample count,
    # as before, so it scales with the batch size.
    return total_loss/count, correct/count

# For bag
def train_epoch_emb(net,dataloader,lr=0.01,optimizer=None,loss_fn = torch.nn.CrossEntropyLoss(),epoch_size=None, report_freq=200):
    """Train `net` on batches of (labels, text, offsets) and report running accuracy.

    Args:
        net: model called as `net(text, off)`.
        dataloader: iterable yielding `(labels, text, off)` tensor triples.
        lr: learning rate for the default Adam optimizer.
        optimizer: optimizer to use; when falsy, Adam over `net.parameters()`
            is created with `lr`.
        loss_fn: loss module called as `loss_fn(out, labels)`.
        epoch_size: if set, stop once more than this many samples were seen.
        report_freq: print the running accuracy every `report_freq` batches.

    Returns:
        `(total_loss / count, correct / count)`, where `total_loss` is the sum
        of the per-batch loss values and `count` is the number of samples seen.

    Raises:
        ValueError: if the dataloader yields no samples.
    """
    optimizer = optimizer or torch.optim.Adam(net.parameters(),lr=lr)
    # `device` is a module-level name defined outside this function.
    loss_fn = loss_fn.to(device)
    net.train()
    total_loss, correct, count = 0.0, 0, 0
    for step, (labels, text, off) in enumerate(dataloader, start=1):
        optimizer.zero_grad()
        labels, text, off = labels.to(device), text.to(device), off.to(device)
        out = net(text, off)
        loss = loss_fn(out, labels)
        loss.backward()
        optimizer.step()
        # Accumulate a plain float: adding the loss tensor itself would keep
        # autograd history alive and grow memory with every batch.
        total_loss += loss.item()
        _, predicted = torch.max(out, 1)
        correct += (predicted == labels).sum().item()
        count += len(labels)
        if step % report_freq == 0:
            print(f"{count}: acc={correct/count}")
        if epoch_size and count > epoch_size:
            break
    if count == 0:
        raise ValueError("dataloader yielded no samples; nothing to train on")
    # The loss figure divides a sum of per-batch losses by the sample count,
    # as before, so it scales with the batch size.
    return total_loss/count, correct/count

In [8]:
# Shuffled batches of 16 reviews, collated by padify into (labels, padded index matrix).
train_loader = torch.utils.data.DataLoader(train_dataset, batch_size=16, collate_fn=padify, shuffle=True)

print("Starting regular embed classifier")
# 32-dimensional embeddings, one output logit per class.
net = EmbedClassifier(vocab_size,32,len(classes)).to(device)
# epoch_size stops training once more than 25000 samples have been seen.
# The returned (loss, accuracy) tuple is discarded here.
# NOTE(review): no optimizer is passed, so train_epoch builds Adam with lr=1 - confirm this is intended.
train_epoch(net,train_loader,vocab, lr=1, epoch_size=25000)


print("Starting bag embed classifier")
# train_loader is rebound: same dataset, but offsetify yields (labels, flat text, offsets).
train_loader = torch.utils.data.DataLoader(train_dataset, batch_size=16, collate_fn=offsetify, shuffle=True)
bagnet = EmbedBagClassifier(vocab_size,32,len(classes)).to(device)
# Last expression of the cell, so its (loss, accuracy) tuple is what the notebook displays.
# NOTE(review): as above, this runs the default Adam optimizer, here with lr=4 - confirm.
train_epoch_emb(bagnet,train_loader, lr=4, epoch_size=25000)

Starting regular embed classifier
3200: acc=0.299375
6400: acc=0.31234375
9600: acc=0.3285416666666667
12800: acc=0.34484375
16000: acc=0.352125
19200: acc=0.359375
22400: acc=0.3659375
Starting bag embed classifier
3200: acc=0.3203125
6400: acc=0.3334375
9600: acc=0.346875
12800: acc=0.358984375
16000: acc=0.365375
19200: acc=0.369375
22400: acc=0.3741964285714286


(26.26945877319258, 0.37703934740882916)

In [29]:
print("Regular testing")

# Spot-check the padded-sequence model on two hand-picked test reviews.
# The same steps were previously copy-pasted once per review; a loop keeps
# them in one place. Inference runs in eval mode without gradient tracking.
net.eval()
for test_no, idx in enumerate((0, 14), start=1):
    single_instance = test_dataset[idx]

    # Wrap it in a list to create a batch of size one
    single_batch = [single_instance]

    # Apply padify to the single batch
    labels, text = padify(single_batch)

    # Move tensors to the device
    labels, text = labels.to(device), text.to(device)

    # Pass through the model
    with torch.no_grad():
        output = net(text)
    _,predicted = torch.max(output,1)

    # Predictions are zero-based class indices, so add 1 to get back to the
    # same scale as the dataset label printed next to it.
    print(f"Test {test_no}")
    print(f"Review text: {single_instance[1]}.\n Actual Review: {single_instance[0]}, Predicted: {predicted[0]+1}")

Regular testing
Test 1
Review text: Don't waste your time.  We had two different people come to our house to give us estimates for a deck (one of them the OWNER).  Both times, we never heard from them.  Not a call, not the estimate, nothing..
 Actual Review: 1, Predicted: 1
Test 2
Review text: Wast there last Friday. Seats right in front if the stage. The show was good. The headliner, while a bit long, was good. Fantastic service from our waitresses. Will definitely go back..
 Actual Review: 4, Predicted: 4


In [25]:
print("Bagnet testing")

# Spot-check the EmbeddingBag model on two hand-picked test reviews.
# The same steps were previously copy-pasted once per review; a loop keeps
# them in one place. Inference runs in eval mode without gradient tracking.
bagnet.eval()
for test_no, idx in enumerate((0, 14), start=1):
    single_instance = test_dataset[idx]

    # Wrap it in a list to create a batch of size one
    single_batch = [single_instance]

    # Apply offsetify to the single batch
    labels, text, offsets = offsetify(single_batch)

    # Move tensors to the device
    labels, text, offsets = labels.to(device), text.to(device), offsets.to(device)

    # Pass through the model
    with torch.no_grad():
        output = bagnet(text, offsets)
    _,predicted = torch.max(output,1)

    # Predictions are zero-based class indices, so add 1 to get back to the
    # same scale as the dataset label printed next to it.
    print(f"Test {test_no}")
    print(f"Review text: {single_instance[1]}.\n Actual Review: {single_instance[0]}, Predicted: {predicted[0]+1}")

Bagnet testing
Test 1
Review text: Don't waste your time.  We had two different people come to our house to give us estimates for a deck (one of them the OWNER).  Both times, we never heard from them.  Not a call, not the estimate, nothing..
 Actual Review: 1, Predicted: 1
Test 2
Review text: Wast there last Friday. Seats right in front if the stage. The show was good. The headliner, while a bit long, was good. Fantastic service from our waitresses. Will definitely go back..
 Actual Review: 4, Predicted: 3
