# set up the data

In [1]:
import torchtext
from torchtext.vocab import GloVe

TEXT = torchtext.data.Field(lower=True,fix_length=200,batch_first=True)
LABEL = torchtext.data.Field(sequential=False)
train,valid,test = torchtext.datasets.SST.splits(TEXT,LABEL, fine_grained=True)
# TEXT and LABEL are objects adapted for the our data(we assume the train set is big enough to include almost all words)
TEXT.build_vocab(train,vectors=GloVe(name='6B',dim=100),max_size=20000,min_freq=10)
LABEL.build_vocab(train)

train_iter, valid_iter, test_iter = torchtext.data.BucketIterator.splits((train,valid,test),batch_size=16)



  from .autonotebook import tqdm as notebook_tqdm


# Demo of the data

In [2]:
for batch in train_iter:
    print(batch.text.size())
    print(batch.label.size())
    print(batch.label)
    break

torch.Size([16, 200])
torch.Size([16])
tensor([2, 3, 3, 1, 1, 1, 2, 2, 2, 3, 4, 1, 1, 1, 3, 3])




In [3]:
TEXT.vocab.vectors.size()
# here TEXT acts as a translator between words and their indices, and also provides the pretrained word vectors

torch.Size([1682, 100])

# Trainning

The detailed model design is too complex and it is written in the corresponding py file

I decompose the whole model into 2 parts: encoder and backbone.

I wrap both of the two parts with a classifier and define a class of trainer for grid search.

Based on my previous experience, I choose Adam and upgrade the rnn to lstm. Additionally, I set the dropout to 0.1 to avoid overfitting.


In [2]:
import torch.backends
from classifier import classifier
import torch
from torch import nn
from torch.nn import functional as F
from trainer import Trainer
torch.backends.cudnn.benchmark = True
torch.backends.cudnn.enabled = True


## EXP1: glove VS random

In [5]:
input_size = 100
hidden_size = 512
output_size = 6 # due to the fine_grained=True, the tag is from 1-5, so the output_size is 6
dropout = 0.1
encoder = "rnn"
embedding_type = "glove"
attention = False
num_layers = 2

lr=0.0002
bsz=16
epochs=21
train_iter, valid_iter, test_iter = torchtext.data.BucketIterator.splits((train,valid,test),batch_size=bsz)
model = classifier(input_size,hidden_size,num_class=output_size,encoder=encoder,embedding_type=embedding_type,attention=attention,TEXT=TEXT)
model = classifier(input_size,hidden_size,dropout=dropout,num_layers=num_layers, num_class=output_size,encoder=encoder,embedding_type=embedding_type,attention=attention,TEXT=TEXT)
trainer = Trainer(model, TEXT, LABEL, batch_size=bsz, lr=lr, epochs=epochs)
trainer.train(train_iter, valid_iter, test_iter)




Using cuda


  5%|▍         | 1/21 [00:13<04:31, 13.56s/it]

Epoch: 0, Training Loss: 1.7285, Validation Loss: 1.7336, Validation Acc: 0.2534


 48%|████▊     | 10/21 [02:11<02:24, 13.11s/it]

Epoch: 10, Training Loss: 1.5990, Validation Loss: 1.6627, Validation Acc: 0.3678


 52%|█████▏    | 11/21 [02:25<02:14, 13.46s/it]

Test Accuracy: 0.3624


 95%|█████████▌| 20/21 [04:22<00:13, 13.04s/it]

Epoch: 20, Training Loss: 1.5217, Validation Loss: 1.6608, Validation Acc: 0.3833


100%|██████████| 21/21 [04:36<00:00, 13.18s/it]

Test Accuracy: 0.3670





In [6]:
input_size = 100
hidden_size = 512
output_size = 6
dropout = 0.1
encoder = "rnn"
embedding_type = "trainable"
attention = False
num_layers = 2

lr=0.0002
bsz=16
epochs=21
train_iter, valid_iter, test_iter = torchtext.data.BucketIterator.splits((train,valid,test),batch_size=bsz)
model = classifier(input_size,hidden_size,num_class=output_size,encoder=encoder,embedding_type=embedding_type,attention=attention,TEXT=TEXT)
model = classifier(input_size,hidden_size,dropout=dropout,num_layers=num_layers, num_class=output_size,encoder=encoder,embedding_type=embedding_type,attention=attention,TEXT=TEXT)
trainer = Trainer(model, TEXT, LABEL, batch_size=bsz, lr=lr, epochs=epochs)
trainer.train(train_iter, valid_iter, test_iter)


Using cuda


  5%|▍         | 1/21 [00:13<04:28, 13.44s/it]

Epoch: 0, Training Loss: 1.7284, Validation Loss: 1.7337, Validation Acc: 0.2534


 48%|████▊     | 10/21 [02:11<02:25, 13.20s/it]

Epoch: 10, Training Loss: 1.6753, Validation Loss: 1.7020, Validation Acc: 0.3224


 52%|█████▏    | 11/21 [02:26<02:15, 13.57s/it]

Test Accuracy: 0.3321


 95%|█████████▌| 20/21 [04:26<00:13, 13.35s/it]

Epoch: 20, Training Loss: 1.6116, Validation Loss: 1.6898, Validation Acc: 0.3342


100%|██████████| 21/21 [04:40<00:00, 13.36s/it]

Test Accuracy: 0.3511





The result shows that the pretrained glove is better. So we will continue to use it later.

## EXP2: RNN(lstm) VS transformer

In [7]:
input_size = 100
hidden_size = 512
output_size = 6
dropout = 0.1
encoder = "transformer"
embedding_type = "glove"
attention = False
num_layers = 2

lr=0.0002
bsz=16
epochs=21
train_iter, valid_iter, test_iter = torchtext.data.BucketIterator.splits((train,valid,test),batch_size=bsz)
model = classifier(input_size,hidden_size,num_class=output_size,encoder=encoder,embedding_type=embedding_type,attention=attention,TEXT=TEXT)
model = classifier(input_size,hidden_size,dropout=dropout,num_layers=num_layers, num_class=output_size,encoder=encoder,embedding_type=embedding_type,attention=attention,TEXT=TEXT)
trainer = Trainer(model, TEXT, LABEL, batch_size=bsz, lr=lr, epochs=epochs)
trainer.train(train_iter, valid_iter, test_iter)


Using cuda


  5%|▍         | 1/21 [00:04<01:28,  4.41s/it]

Epoch: 0, Training Loss: 1.7272, Validation Loss: 1.7350, Validation Acc: 0.2534


 52%|█████▏    | 11/21 [00:47<00:43,  4.31s/it]

Epoch: 10, Training Loss: 1.7078, Validation Loss: 1.7377, Validation Acc: 0.2770


100%|██████████| 21/21 [01:30<00:00,  4.29s/it]

Epoch: 20, Training Loss: 1.6745, Validation Loss: 1.6967, Validation Acc: 0.3261
Test Accuracy: 0.3127





we can directly feel that the transformer is faster!

But it lags behind the rnn. Maybe, two layers restrict transformer's expression ability.

### EXP3: attention layer

In [8]:
input_size = 100
hidden_size = 512
output_size = 6
dropout = 0.1
encoder = "transformer"
embedding_type = "glove"
attention = True
num_layers = 2

lr=0.0002
bsz=16
epochs=21
train_iter, valid_iter, test_iter = torchtext.data.BucketIterator.splits((train,valid,test),batch_size=bsz)
model = classifier(input_size,hidden_size,num_class=output_size,encoder=encoder,embedding_type=embedding_type,attention=attention,TEXT=TEXT)
model = classifier(input_size,hidden_size,dropout=dropout,num_layers=num_layers, num_class=output_size,encoder=encoder,embedding_type=embedding_type,attention=attention,TEXT=TEXT)
trainer = Trainer(model, TEXT, LABEL, batch_size=bsz, lr=lr, epochs=epochs)
trainer.train(train_iter, valid_iter, test_iter)


Using cuda


  0%|          | 0/21 [00:00<?, ?it/s]

  5%|▍         | 1/21 [00:04<01:34,  4.72s/it]

Epoch: 0, Training Loss: 1.7278, Validation Loss: 1.7323, Validation Acc: 0.2534


 48%|████▊     | 10/21 [00:46<00:50,  4.59s/it]

Epoch: 10, Training Loss: 1.6918, Validation Loss: 1.6978, Validation Acc: 0.3333


 52%|█████▏    | 11/21 [00:51<00:47,  4.77s/it]

Test Accuracy: 0.3299


 95%|█████████▌| 20/21 [01:32<00:04,  4.62s/it]

Epoch: 20, Training Loss: 1.6601, Validation Loss: 1.6746, Validation Acc: 0.3560


100%|██████████| 21/21 [01:37<00:00,  4.66s/it]

Test Accuracy: 0.3403





In [9]:
input_size = 100
hidden_size = 512
output_size = 6
dropout = 0.1
encoder = "rnn"
embedding_type = "glove"
attention = True
num_layers = 2

lr=0.0002
bsz=16
epochs=21
train_iter, valid_iter, test_iter = torchtext.data.BucketIterator.splits((train,valid,test),batch_size=bsz)
model = classifier(input_size,hidden_size,num_class=output_size,encoder=encoder,embedding_type=embedding_type,attention=attention,TEXT=TEXT)
model = classifier(input_size,hidden_size,dropout=dropout,num_layers=num_layers, num_class=output_size,encoder=encoder,embedding_type=embedding_type,attention=attention,TEXT=TEXT)
trainer = Trainer(model, TEXT, LABEL, batch_size=bsz, lr=lr, epochs=epochs)
trainer.train(train_iter, valid_iter, test_iter)


Using cuda


  5%|▍         | 1/21 [00:14<04:54, 14.73s/it]

Epoch: 0, Training Loss: 1.7297, Validation Loss: 1.7313, Validation Acc: 0.2534


 48%|████▊     | 10/21 [02:22<02:35, 14.14s/it]

Epoch: 10, Training Loss: 1.5414, Validation Loss: 1.6432, Validation Acc: 0.3942


 52%|█████▏    | 11/21 [02:37<02:25, 14.56s/it]

Test Accuracy: 0.3593


 95%|█████████▌| 20/21 [04:46<00:14, 14.27s/it]

Epoch: 20, Training Loss: 1.4755, Validation Loss: 1.6498, Validation Acc: 0.3851


100%|██████████| 21/21 [05:01<00:00, 14.36s/it]

Test Accuracy: 0.3584





The lr and batchsize has been searched in the py file trainer.py. 

Due to its complexity, I don't show it here.

### Sota

Previous exps aim to indicate the hyperparameters' influence on the models. For better performance and application, I adapt gru to substitute for the lstm for fast speed and do the data augmentation.

For the data augmentation, I shorten the fix_length to 50 as I find that the max length of the sentence is 32 so we can reduce the padding info.

Additionally, I adapt large pretrained embedding to bring my model to the next level.

In [3]:
import torch.backends
from classifier import classifier
import torch
from torch import nn
from torch.nn import functional as F
from trainer import Trainer
import torchtext
from torchtext.vocab import GloVe
# set the random seed for reproducibility
torch.manual_seed(0)
TEXT = torchtext.data.Field(lower=True,fix_length=50,batch_first=True)
LABEL = torchtext.data.Field(sequential=False)
train,valid,test = torchtext.datasets.SST.splits(TEXT,LABEL, fine_grained=True)
# TEXT and LABEL are objects adapted for the our data(we assume the train set is big enough to include almost all words)
TEXT.build_vocab(train,vectors=GloVe(name='6B',dim=300),max_size=20000,min_freq=10)
LABEL.build_vocab(train)

#train_iter, valid_iter, test_iter = torchtext.data.BucketIterator.splits((train,valid,test),batch_size=16)

input_size = 300
hidden_size = 512
output_size = 5
dropout = 0.1
encoder = "gru"
embedding_type = "glove"
attention = True
num_layers = 2

lr=0.0001
bsz=16
epochs=20
train_iter, valid_iter, test_iter = torchtext.data.BucketIterator.splits((train,valid,test),batch_size=bsz)
model = classifier(input_size,hidden_size,num_class=output_size,encoder=encoder,embedding_type=embedding_type,attention=attention,TEXT=TEXT)
model = classifier(input_size,hidden_size,dropout=dropout,num_layers=num_layers, num_class=output_size,encoder=encoder,embedding_type=embedding_type,attention=attention,TEXT=TEXT)
trainer = Trainer(model, TEXT, LABEL, batch_size=bsz, lr=lr, epochs=epochs)
trainer.train(train_iter, valid_iter, test_iter)


Using cuda


  5%|▌         | 1/20 [00:04<01:18,  4.11s/it]

Epoch: 0, Training Loss: 1.5782, Validation Loss: 1.5797, Validation Acc: 0.2625


 20%|██        | 4/20 [00:15<01:03,  3.97s/it]

Epoch: 4, Training Loss: 1.4205, Validation Loss: 1.4913, Validation Acc: 0.3960


 25%|██▌       | 5/20 [00:20<01:01,  4.11s/it]

Test Accuracy: 0.4023


 40%|████      | 8/20 [00:32<00:48,  4.01s/it]

Epoch: 8, Training Loss: 1.4085, Validation Loss: 1.5019, Validation Acc: 0.3860


 45%|████▌     | 9/20 [00:36<00:45,  4.12s/it]

Test Accuracy: 0.3810


 60%|██████    | 12/20 [00:48<00:32,  4.02s/it]

Epoch: 12, Training Loss: 1.3592, Validation Loss: 1.5082, Validation Acc: 0.3806


 65%|██████▌   | 13/20 [00:52<00:28,  4.12s/it]

Test Accuracy: 0.3869


 80%|████████  | 16/20 [01:04<00:16,  4.03s/it]

Epoch: 16, Training Loss: 1.3010, Validation Loss: 1.5079, Validation Acc: 0.3869


 85%|████████▌ | 17/20 [01:09<00:12,  4.15s/it]

Test Accuracy: 0.4163


100%|██████████| 20/20 [01:21<00:00,  4.06s/it]


# Result
The best setting for such a task is rnn(gru) with glove pretrained embedding, no attention.

It reaches 41.63% in test.