# set up the data

In [1]:
import torchtext
from torchtext.vocab import GloVe

# Field objects describe how raw examples are turned into tensors.
# TEXT lower-cases tokens and fixes every sentence to 200 tokens, batch first;
# LABEL holds a single (non-sequential) value per example.
TEXT = torchtext.data.Field(
    lower=True,
    fix_length=200,
    batch_first=True,
)
LABEL = torchtext.data.Field(sequential=False)

train, valid, test = torchtext.datasets.SST.splits(TEXT, LABEL)

# TEXT and LABEL are fitted to our data: both vocabularies are built from the
# training split only (we assume it is big enough to include almost all words).
# The TEXT vocabulary also loads the 100-d GloVe 6B vectors.
TEXT.build_vocab(
    train,
    vectors=GloVe(name='6B', dim=100),
    max_size=20000,
    min_freq=10,
)
LABEL.build_vocab(train)

train_iter, valid_iter, test_iter = torchtext.data.BucketIterator.splits(
    (train, valid, test),
    batch_size=16,
)



  from .autonotebook import tqdm as notebook_tqdm


# Demo of the data

In [2]:
# Peek at one mini-batch to check the tensor shapes of text and label.
batch = next(iter(train_iter), None)
if batch is not None:
    print(batch.text.size())
    print(batch.label.size())

torch.Size([16, 200])
torch.Size([16])




In [3]:
# TEXT acts as a translator between words and their indices, and it also
# provides the pretrained word vectors whose size is displayed here.
TEXT.vocab.vectors.shape

torch.Size([1682, 100])

# Training

The detailed model design is too complex to show inline, so it is written in the corresponding .py files (`classifier.py` and `trainer.py`).

I decompose the whole model into 2 parts: encoder and backbone.

I wrap both parts in a classifier and define a `Trainer` class for grid search.

Based on my previous experience, I chose Adam as the optimizer and upgraded the RNN to an LSTM. Additionally, I set the dropout to 0.1 to reduce overfitting.


In [3]:
from classifier import classifier
import torch
from torch import nn
from torch.nn import functional as F
from trainer import Trainer


## EXP1: glove VS random

In [5]:
# EXP1 (a): "rnn" encoder with pretrained GloVe embeddings, no attention.

# Model hyper-parameters
input_size = 100  # presumably the embedding dimension (GloVe dim=100) — confirm in classifier.py
hidden_size = 512
output_size = 5
dropout = 0.1
encoder = "rnn"
embedding_type = "glove"
attention = False
num_layers = 2

# Optimisation hyper-parameters
lr = 0.0002
bsz = 16
epochs = 20  # NOTE(review): the other experiment cells use epochs=11 — confirm this is intended

# Rebuild the iterators so they follow the batch size chosen in this cell.
train_iter, valid_iter, test_iter = torchtext.data.BucketIterator.splits(
    (train, valid, test), batch_size=bsz
)

# Build the model exactly once, passing every hyper-parameter explicitly.
# Constructing a throwaway instance first would only waste time/memory and
# consume random-number state before the real model is initialised.
model = classifier(
    input_size,
    hidden_size,
    dropout=dropout,
    num_layers=num_layers,
    num_class=output_size,
    encoder=encoder,
    embedding_type=embedding_type,
    attention=attention,
    TEXT=TEXT,
)
trainer = Trainer(model, TEXT, LABEL, batch_size=bsz, lr=lr, epochs=epochs)
trainer.train(train_iter, valid_iter, test_iter)




Using cuda


  5%|▌         | 1/20 [00:13<04:14, 13.38s/it]

Epoch: 0, Training Loss: 1.4423, Validation Loss: 1.4199, Validation Acc: 0.4896


 50%|█████     | 10/20 [02:11<02:10, 13.10s/it]

Epoch: 10, Training Loss: 1.2090, Validation Loss: 1.2827, Validation Acc: 0.6203


 55%|█████▌    | 11/20 [02:25<02:01, 13.47s/it]

Test Accuracy: 0.6353


100%|██████████| 20/20 [04:22<00:00, 13.15s/it]


In [4]:
# EXP1 (b): "rnn" encoder with the "trainable" embedding type, no attention.

# Model hyper-parameters
input_size = 100  # presumably the embedding dimension — confirm in classifier.py
hidden_size = 512
output_size = 5
dropout = 0.1
encoder = "rnn"
embedding_type = "trainable"
attention = False
num_layers = 2

# Optimisation hyper-parameters
lr = 0.0002
bsz = 16
epochs = 11

# Rebuild the iterators so they follow the batch size chosen in this cell.
train_iter, valid_iter, test_iter = torchtext.data.BucketIterator.splits(
    (train, valid, test), batch_size=bsz
)

# Build the model exactly once, passing every hyper-parameter explicitly.
# Constructing a throwaway instance first would only waste time/memory and
# consume random-number state before the real model is initialised.
model = classifier(
    input_size,
    hidden_size,
    dropout=dropout,
    num_layers=num_layers,
    num_class=output_size,
    encoder=encoder,
    embedding_type=embedding_type,
    attention=attention,
    TEXT=TEXT,
)
trainer = Trainer(model, TEXT, LABEL, batch_size=bsz, lr=lr, epochs=epochs)
trainer.train(train_iter, valid_iter, test_iter)




Using cuda


  9%|▉         | 1/11 [00:13<02:14, 13.43s/it]

Epoch: 0, Training Loss: 1.4466, Validation Loss: 1.4449, Validation Acc: 0.3887


 91%|█████████ | 10/11 [02:10<00:12, 12.90s/it]

Epoch: 10, Training Loss: 1.3097, Validation Loss: 1.3352, Validation Acc: 0.5677


100%|██████████| 11/11 [02:23<00:00, 13.08s/it]

Test Accuracy: 0.5706





The results show that the pretrained GloVe embedding is better (test accuracy 0.6353 vs. 0.5706), so we will keep using it in the following experiments.

## EXP2: RNN(lstm) VS transformer

In [6]:
# EXP2: "transformer" encoder with pretrained GloVe embeddings, no attention.

# Model hyper-parameters
input_size = 100  # presumably the embedding dimension (GloVe dim=100) — confirm in classifier.py
hidden_size = 512
output_size = 5
dropout = 0.1
encoder = "transformer"
embedding_type = "glove"
attention = False
num_layers = 2

# Optimisation hyper-parameters
lr = 0.0002
bsz = 16
epochs = 11

# Rebuild the iterators so they follow the batch size chosen in this cell.
train_iter, valid_iter, test_iter = torchtext.data.BucketIterator.splits(
    (train, valid, test), batch_size=bsz
)

# Build the model exactly once, passing every hyper-parameter explicitly.
# Constructing a throwaway instance first would only waste time/memory and
# consume random-number state before the real model is initialised.
model = classifier(
    input_size,
    hidden_size,
    dropout=dropout,
    num_layers=num_layers,
    num_class=output_size,
    encoder=encoder,
    embedding_type=embedding_type,
    attention=attention,
    TEXT=TEXT,
)
trainer = Trainer(model, TEXT, LABEL, batch_size=bsz, lr=lr, epochs=epochs)
trainer.train(train_iter, valid_iter, test_iter)


Using cuda


  0%|          | 0/11 [00:00<?, ?it/s]

  9%|▉         | 1/11 [00:04<00:42,  4.22s/it]

Epoch: 0, Training Loss: 1.4432, Validation Loss: 1.4445, Validation Acc: 0.3887


100%|██████████| 11/11 [00:44<00:00,  4.08s/it]

Epoch: 10, Training Loss: 1.3379, Validation Loss: 1.3513, Validation Acc: 0.5386
Test Accuracy: 0.5570





We can see directly that the transformer is faster (roughly 4 s vs. 13 s per epoch)!

However, it lags behind the RNN in accuracy. Perhaps using only two layers restricts the transformer's expressive power.

## EXP3: attention layer

In [7]:
# EXP3 (a): "transformer" encoder with pretrained GloVe embeddings, attention enabled.

# Model hyper-parameters
input_size = 100  # presumably the embedding dimension (GloVe dim=100) — confirm in classifier.py
hidden_size = 512
output_size = 5
dropout = 0.1
encoder = "transformer"
embedding_type = "glove"
attention = True
num_layers = 2

# Optimisation hyper-parameters
lr = 0.0002
bsz = 16
epochs = 11

# Rebuild the iterators so they follow the batch size chosen in this cell.
train_iter, valid_iter, test_iter = torchtext.data.BucketIterator.splits(
    (train, valid, test), batch_size=bsz
)

# Build the model exactly once, passing every hyper-parameter explicitly.
# Constructing a throwaway instance first would only waste time/memory and
# consume random-number state before the real model is initialised.
model = classifier(
    input_size,
    hidden_size,
    dropout=dropout,
    num_layers=num_layers,
    num_class=output_size,
    encoder=encoder,
    embedding_type=embedding_type,
    attention=attention,
    TEXT=TEXT,
)
trainer = Trainer(model, TEXT, LABEL, batch_size=bsz, lr=lr, epochs=epochs)
trainer.train(train_iter, valid_iter, test_iter)




Using cuda


  9%|▉         | 1/11 [00:04<00:49,  4.98s/it]

Epoch: 0, Training Loss: 1.4428, Validation Loss: 1.4442, Validation Acc: 0.3887


 91%|█████████ | 10/11 [00:48<00:04,  4.82s/it]

Epoch: 10, Training Loss: 1.3187, Validation Loss: 1.3442, Validation Acc: 0.5559


100%|██████████| 11/11 [00:53<00:00,  4.86s/it]

Test Accuracy: 0.5516





In [8]:
# EXP3 (b): "rnn" encoder with pretrained GloVe embeddings, attention enabled.

# Model hyper-parameters
input_size = 100  # presumably the embedding dimension (GloVe dim=100) — confirm in classifier.py
hidden_size = 512
output_size = 5
dropout = 0.1
encoder = "rnn"
embedding_type = "glove"
attention = True
num_layers = 2

# Optimisation hyper-parameters
lr = 0.0002
bsz = 16
epochs = 11

# Rebuild the iterators so they follow the batch size chosen in this cell.
train_iter, valid_iter, test_iter = torchtext.data.BucketIterator.splits(
    (train, valid, test), batch_size=bsz
)

# Build the model exactly once, passing every hyper-parameter explicitly.
# Constructing a throwaway instance first would only waste time/memory and
# consume random-number state before the real model is initialised.
model = classifier(
    input_size,
    hidden_size,
    dropout=dropout,
    num_layers=num_layers,
    num_class=output_size,
    encoder=encoder,
    embedding_type=embedding_type,
    attention=attention,
    TEXT=TEXT,
)
trainer = Trainer(model, TEXT, LABEL, batch_size=bsz, lr=lr, epochs=epochs)
trainer.train(train_iter, valid_iter, test_iter)


Using cuda


  9%|▉         | 1/11 [00:14<02:27, 14.73s/it]

Epoch: 0, Training Loss: 1.4622, Validation Loss: 1.5014, Validation Acc: 0.4033


 91%|█████████ | 10/11 [02:21<00:14, 14.19s/it]

Epoch: 10, Training Loss: 1.1877, Validation Loss: 1.3021, Validation Acc: 0.5958


100%|██████████| 11/11 [02:37<00:00, 14.28s/it]

Test Accuracy: 0.6208





The learning rate and batch size were searched in `trainer.py`.

Because that search is fairly complex, I don't show it here.

# Result
The best setting for this task is the RNN (LSTM) encoder with pretrained GloVe embeddings and no attention.

It reaches about 63% accuracy on the test set.