## Classification baseline for Stanford Sentiment Treebank (SST)

In [1]:
import torch

from torchtext import data
from torchtext import datasets 
import sys
import seaborn as sns
from sklearn.metrics import roc_curve, auc

sys.path.insert(0, '../../Utils/')

import matplotlib.pyplot as plt
%matplotlib inline  

import models
from train import *
from metrics import * 

# Record the interpreter and PyTorch versions this notebook was executed with.
print(
    "Python: %s" % sys.version,
    "Pytorch: %s" % torch.__version__,
    sep="\n",
)

Python: 3.7.0 (default, Jun 28 2018, 13:15:42) 
[GCC 7.2.0]
Pytorch: 1.0.0


### Load SST using Torchtext

In [2]:
# Tokenisation uses spaCy's English model. If this cell fails with
#   OSError: [E050] Can't find model 'en'. It doesn't seem to be a shortcut link, a Python package or a valid path to a data directory.
# install the model first:
#   python -m spacy download en

# Field definitions: sentence text is tokenised with spaCy; labels use LongTensor.
TEXT = data.Field(tokenize='spacy')
LABEL = data.LabelField(tensor_type=torch.LongTensor)

# Load the fine-grained SST train / validation / test splits.
train, val, test = datasets.SST.splits(
    TEXT,
    LABEL,
    root='../../Datasets/SST_data',
    fine_grained=True,
)

# Both vocabularies are built from the training split only; the text vocabulary
# is capped at 25000 entries and requests the "glove.6B.100d" vectors.
TEXT.build_vocab(
    train,
    max_size=25000,
    vectors="glove.6B.100d",
    vectors_cache='../../Datasets/SST_data/vector_cache',
)
LABEL.build_vocab(train)

BATCH_SIZE = 32

# One iterator per split; examples are keyed by token count for bucketing.
train_itr, val_itr, test_itr = data.BucketIterator.splits(
    (train, val, test),
    batch_size=BATCH_SIZE,
    sort_key=lambda example: len(example.text),
    repeat=False,
)


### Create bidirectional LSTM model

In [3]:
# Seed torch's RNG before the model is built so that torch-based random
# initialisation is repeatable across "Restart Kernel -> Run All".
# NOTE(review): this does not by itself fix the batch shuffling order of the
# iterators created in the data-loading cell - confirm whether that is needed.
SEED = 1234
torch.manual_seed(SEED)

vocab_size = len(TEXT.vocab)
embedding_size = 100  # presumably chosen to match the 100-d "glove.6B.100d" vectors
hidden_size = 256
output_size = 5

RNN_model = models.RNN(vocab_size, embedding_size, hidden_size, output_size)

# Overwrite the embedding weights with the pretrained vectors attached to the
# text vocabulary.
pretrained_embeddings = TEXT.vocab.vectors
RNN_model.embedding.weight.data.copy_(pretrained_embeddings)

# Adam with its default hyper-parameters; cross-entropy over the class logits.
optimizer = torch.optim.Adam(RNN_model.parameters())
criterion = torch.nn.CrossEntropyLoss()

# Use the GPU when one is available, otherwise stay on the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

RNN_model = RNN_model.to(device)
criterion = criterion.to(device)




### Utility functions

In [4]:
def classification_accuracy(preds, y):
    """Return the fraction of predictions that equal their targets.

    Args:
        preds: tensor of predicted class indices.
        y: tensor of target class indices, compared element-wise with ``preds``.

    Returns:
        A 0-dim float tensor: the number of matching elements divided by the
        total number of compared elements.
    """
    correct = (preds == y).float()  # 1.0 where the prediction matches, else 0.0
    # Divide by the total element count, not len(): len() only counts the first
    # dimension, which over-reports accuracy for multi-dimensional input and
    # raises on 0-dim input. For 1-D input the result is unchanged.
    return correct.sum() / correct.numel()

def binary_accuracy(preds, y):
    """Return the fraction of rounded predictions that equal their targets.

    Args:
        preds: tensor of scores; each is rounded to the nearest integer before
            comparison. No sigmoid is applied here, so this presumably expects
            values already in [0, 1] - confirm against the caller.
        y: tensor of targets, compared element-wise with the rounded scores.

    Returns:
        A 0-dim float tensor: the number of matching elements divided by the
        total number of compared elements.
    """
    rounded_preds = torch.round(preds)

    correct = (rounded_preds == y).float()  # 1.0 where the prediction matches, else 0.0
    # Divide by the total element count, not len(): len() only counts the first
    # dimension, which over-reports accuracy for multi-dimensional input and
    # raises on 0-dim input. For 1-D input the result is unchanged.
    return correct.sum() / correct.numel()

### Train function

In [5]:
def train(model, iterator, optimizer, criterion):
    """Run one optimisation pass over ``iterator`` and report loss and accuracy.

    Note: defining this function rebinds the notebook-level name ``train``,
    which the data-loading cell had bound to the training split.

    Args:
        model: module called as ``model(batch.text)``; switched to training
            mode by this function.
        iterator: iterable of batches exposing ``.text`` and ``.label``.
        optimizer: optimiser whose gradients are cleared and which is stepped
            once per batch.
        criterion: loss called as ``criterion(predictions, batch.label)``.
            Assumed to return the mean loss over the batch (the default for
            ``torch.nn.CrossEntropyLoss``).

    Returns:
        ``(mean_loss, mean_accuracy)`` as Python floats, averaged per example.

    Raises:
        ZeroDivisionError: if ``iterator`` yields no examples.
    """
    total_loss = 0.0
    total_correct = 0
    total_examples = 0

    model.train()

    for batch in iterator:
        optimizer.zero_grad()

        # squeeze(1) only changes the shape when dimension 1 has size 1.
        predictions = model(batch.text).squeeze(1)
        loss = criterion(predictions, batch.label)

        loss.backward()
        optimizer.step()

        # Weight every batch by its size: averaging per-batch means directly
        # over-weights a smaller final batch.
        batch_size = batch.label.size(0)
        total_loss += loss.item() * batch_size
        total_correct += (predictions.argmax(dim=1) == batch.label).sum().item()
        total_examples += batch_size

    return total_loss / total_examples, total_correct / total_examples

### Evaluation function

In [6]:
def evaluate(model, iterator, criterion):
    """Score ``model`` on ``iterator`` without updating any parameters.

    Args:
        model: module called as ``model(batch.text)``; switched to evaluation
            mode by this function.
        iterator: iterable of batches exposing ``.text`` and ``.label``.
        criterion: loss called as ``criterion(predictions, batch.label)``.
            Assumed to return the mean loss over the batch (the default for
            ``torch.nn.CrossEntropyLoss``).

    Returns:
        ``(mean_loss, mean_accuracy)`` as Python floats, averaged per example.

    Raises:
        ZeroDivisionError: if ``iterator`` yields no examples.
    """
    total_loss = 0.0
    total_correct = 0
    total_examples = 0

    model.eval()

    # Gradients are not needed for scoring.
    with torch.no_grad():
        for batch in iterator:
            # squeeze(1) only changes the shape when dimension 1 has size 1.
            predictions = model(batch.text).squeeze(1)
            loss = criterion(predictions, batch.label)

            # Weight every batch by its size: averaging per-batch means
            # directly over-weights a smaller final batch.
            batch_size = batch.label.size(0)
            total_loss += loss.item() * batch_size
            total_correct += (predictions.argmax(dim=1) == batch.label).sum().item()
            total_examples += batch_size

    return total_loss / total_examples, total_correct / total_examples

### Train classification model

In [7]:
n_epochs = 20

# Each epoch: one optimisation pass over the training iterator, then one
# scoring pass over the validation iterator. Losses are printed as-is,
# accuracies as percentages.
for epoch in range(n_epochs):
    train_loss, train_acc = train(RNN_model, train_itr, optimizer, criterion)
    valid_loss, valid_acc = evaluate(RNN_model, val_itr, criterion)

    epoch_report = (epoch + 1, train_loss, train_acc * 100, valid_loss, valid_acc * 100)
    print('Epoch: %02d, Train Loss: %.3f, Train Acc: %.2f%%, Val. Loss: %.3f, Val. Acc: %.2f%%' % epoch_report)

  return Variable(arr, volatile=not train)


Epoch: 01, Train Loss: 1.514, Train Acc: 31.87%, Val. Loss: 1.387, Val. Acc: 39.22%
Epoch: 02, Train Loss: 1.368, Train Acc: 39.68%, Val. Loss: 1.362, Val. Acc: 39.36%
Epoch: 03, Train Loss: 1.272, Train Acc: 43.81%, Val. Loss: 1.362, Val. Acc: 38.16%
Epoch: 04, Train Loss: 1.173, Train Acc: 48.30%, Val. Loss: 1.334, Val. Acc: 39.90%
Epoch: 05, Train Loss: 1.073, Train Acc: 53.16%, Val. Loss: 1.417, Val. Acc: 42.22%
Epoch: 06, Train Loss: 0.982, Train Acc: 58.08%, Val. Loss: 1.394, Val. Acc: 40.80%
Epoch: 07, Train Loss: 0.891, Train Acc: 62.64%, Val. Loss: 1.480, Val. Acc: 41.24%
Epoch: 08, Train Loss: 0.805, Train Acc: 67.31%, Val. Loss: 1.611, Val. Acc: 40.03%
Epoch: 09, Train Loss: 0.723, Train Acc: 71.15%, Val. Loss: 1.709, Val. Acc: 39.27%
Epoch: 10, Train Loss: 0.661, Train Acc: 73.13%, Val. Loss: 1.826, Val. Acc: 40.12%
Epoch: 11, Train Loss: 0.606, Train Acc: 75.96%, Val. Loss: 1.836, Val. Acc: 40.61%
Epoch: 12, Train Loss: 0.554, Train Acc: 77.66%, Val. Loss: 1.943, Val. Acc:

### Evaluate model on test set

In [8]:
# Score the model on the test iterator, using the weights left by the last
# training epoch (the training loop keeps no checkpoint).
test_loss, test_acc = evaluate(RNN_model, test_itr, criterion)

# Printed as a fraction, not a percentage.
print('RNN test accuracy: %.2f' % test_acc)

RNN test accuracy: 0.41
