## Assignment 2.4: Text classification via CNN (20 points)

In this assignment you will perform sentiment analysis of IMDB reviews using a CNN architecture. Carefully read [Convolutional Neural Networks for Sentence Classification](https://arxiv.org/pdf/1408.5882.pdf) by Yoon Kim.

In [1]:
import numpy as np
import torch

import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim

from torchtext import datasets, data
from torchtext.data import Field, LabelField
from torchtext.data import Iterator
import torch
# Seed PyTorch's global RNG (affects weight initialisation and dropout masks).
# NOTE(review): Python's `random` and NumPy are not seeded here — confirm whether that matters for the data split/shuffling.
torch.manual_seed(42)

<torch._C.Generator at 0x7f805bd3b8f0>

### Preparing Data

In [0]:
# Review text: a token sequence, lower-cased, with batches laid out batch-first.
TEXT = Field(sequential=True, lower=True, batch_first=True)
# Sentiment label, stored as a float tensor so it can be compared directly against a single logit.
LABEL = LabelField(batch_first=True, dtype=torch.float)

In [0]:
# Load the IMDB train / test splits, processed through the TEXT and LABEL fields.
train, tst = datasets.IMDB.splits(TEXT, LABEL)
# Hold out part of the training data for validation, using torchtext's default split settings.
# NOTE(review): no random_state is passed, so the split may not be identical across runs — confirm.
trn, vld = train.split()

In [0]:
# %%time
# Build the token vocabulary from the training portion only (validation/test are not used).
TEXT.build_vocab(trn)

In [0]:
# Build the label vocabulary from the training portion.
LABEL.build_vocab(trn)

### Creating the Iterator (2 points)

Define an iterator here

In [0]:
# One iterator per split, all sharing the same batch size.
# Batches are created on the GPU when CUDA is available, otherwise on the CPU.
train_iter, val_iter, test_iter = data.BucketIterator.splits(
    (trn, vld, tst),
    batch_size= 64,
    device= torch.device('cuda' if torch.cuda.is_available() else 'cpu')
)

In [7]:
# Pull a single training batch and display its token ids, labels and label tensor type.
batch = next(iter(train_iter))
batch.text, batch.label, batch.label.type()

(tensor([[146,  23, 270,  ...,   1,   1,   1],
         [279, 104, 182,  ...,   1,   1,   1],
         [  9,  62, 101,  ...,   1,   1,   1],
         ...,
         [ 15, 195,  15,  ...,   1,   1,   1],
         [ 10,  20,   7,  ...,   1,   1,   1],
         [ 10,   7,   2,  ...,   1,   1,   1]], device='cuda:0'),
 tensor([0., 0., 0., 1., 0., 1., 1., 1., 1., 0., 0., 0., 0., 0., 1., 0., 1., 0.,
         0., 1., 0., 0., 1., 1., 0., 1., 1., 1., 1., 1., 0., 0., 0., 0., 1., 1.,
         0., 0., 1., 0., 0., 0., 1., 1., 0., 1., 1., 1., 0., 1., 0., 1., 0., 1.,
         1., 0., 1., 0., 1., 1., 0., 1., 1., 1.], device='cuda:0'),
 'torch.cuda.FloatTensor')

### Define CNN-based text classification model (8 points)

In [0]:
class CNN(nn.Module):
    """CNN sentence classifier with three parallel convolution branches.

    Token ids are embedded, each branch convolves a window of
    ``kernel_sizes[i]`` tokens spanning the full embedding width, the result
    is max-pooled over the sequence, the pooled features are concatenated,
    passed through dropout and mapped to a single logit.

    Args:
        V: vocabulary size (number of rows in the embedding table).
        D: embedding dimension.
        kernel_sizes: exactly three window heights, in tokens.
        dropout: dropout probability applied to the concatenated features.

    Raises:
        ValueError: if ``kernel_sizes`` does not contain exactly three entries.
    """

    def __init__(self, V, D, kernel_sizes, dropout=0.5):
        super(CNN, self).__init__()

        # This class wires up exactly three branches; fail early instead of
        # hitting an IndexError or a shape mismatch later on.
        if len(kernel_sizes) != 3:
            raise ValueError(
                'CNN expects exactly 3 kernel sizes, got {}'.format(len(kernel_sizes))
            )

        n_filters = 5  # feature maps produced by each convolution branch

        # Use the constructor argument D for every width below; the layer sizes
        # must not depend on a notebook-level global.
        self.embedding = nn.Embedding(
            num_embeddings = V,
            embedding_dim = D
        )

        self.conv_0 = nn.Conv2d(
            in_channels = 1,
            out_channels = n_filters,
            kernel_size = (kernel_sizes[0], D)
        )

        self.conv_1 = nn.Conv2d(
            in_channels = 1,
            out_channels = n_filters,
            kernel_size = (kernel_sizes[1], D)
        )

        self.conv_2 = nn.Conv2d(
            in_channels = 1,
            out_channels = n_filters,
            kernel_size = (kernel_sizes[2], D)
        )
        self.fc = nn.Linear(len(kernel_sizes) * n_filters, 1)
        self.dropout = nn.Dropout(dropout)

    def forward(self, x):
        """Return one logit per input row.

        Args:
            x: integer token ids, batch-first (one row per example). The
                sequence must be at least as long as the largest kernel size.

        Returns:
            Tensor with one column holding the raw (pre-sigmoid) score.
        """
        # Add a singleton channel axis so Conv2d (in_channels=1) accepts the input.
        embedded = self.embedding(x).unsqueeze(1)

        # Each kernel spans the whole embedding width, so the last axis collapses to 1.
        conv_0 = F.relu(self.conv_0(embedded).squeeze(3))
        conv_1 = F.relu(self.conv_1(embedded).squeeze(3))
        conv_2 = F.relu(self.conv_2(embedded).squeeze(3))

        # Max over all window positions: one value per filter.
        pool_0 = F.max_pool1d(conv_0, conv_0.shape[2]).squeeze(2)
        pool_1 = F.max_pool1d(conv_1, conv_1.shape[2]).squeeze(2)
        pool_2 = F.max_pool1d(conv_2, conv_2.shape[2]).squeeze(2)

        cat = self.dropout(torch.cat((pool_0, pool_1, pool_2), dim=1))
        logit = self.fc(cat)
        return logit

In [0]:
# Baseline hyperparameters.
kernel_sizes = [3,4,5]  # window heights (in tokens) of the convolution branches
vocab_size = len(TEXT.vocab)  # one embedding row per vocabulary entry
dropout = 0.5
dim = 300  # embedding dimension

model = CNN(vocab_size, dim, kernel_sizes, dropout)

In [10]:
# Put the model on the same device the iterators create batches on, so the
# notebook also runs on machines without CUDA.
model.to(torch.device('cuda' if torch.cuda.is_available() else 'cpu'))

CNN(
  (embedding): Embedding(201912, 300)
  (conv_0): Conv2d(1, 5, kernel_size=(3, 300), stride=(1, 1))
  (conv_1): Conv2d(1, 5, kernel_size=(4, 300), stride=(1, 1))
  (conv_2): Conv2d(1, 5, kernel_size=(5, 300), stride=(1, 1))
  (fc): Linear(in_features=15, out_features=1, bias=True)
  (dropout): Dropout(p=0.5, inplace=False)
)

### The training loop (3 points)

Define the optimization function and the loss functions.

In [0]:
# Adam with its default hyperparameters over all model parameters.
opt = optim.Adam(model.parameters()) 
# Binary cross-entropy computed on raw logits (the sigmoid is applied inside the loss).
loss_func = nn.BCEWithLogitsLoss() 

Think carefully about the stopping criteria. 

In [0]:
# Fixed epoch budget: the training loop runs exactly this many epochs (no early stopping).
epochs = 10

In [13]:
%%time
for epoch in range(1, epochs + 1):
    running_loss = 0.0
    running_corrects = 0
    model.train() 
    
    for batch in train_iter:  
  
        opt.zero_grad()
        preds = model(batch.text).squeeze(1)

        loss = loss_func(preds, batch.label)
        loss.backward()
        opt.step()
        running_loss += loss.item()
        
    epoch_loss = running_loss / len(trn)
    
    val_loss = 0.0
    model.eval()
    correct = 0
    total = 0 
    for batch in val_iter:

        preds = model(batch.text).squeeze(1)
        loss = loss_func(preds, batch.label)
        val_loss += loss.item()
        
    val_loss /= len(vld)
    
    print('Epoch: {}, Training Loss: {}, Validation Loss: {}'.format(epoch, epoch_loss, val_loss))

Epoch: 1, Training Loss: 0.010962334643091475, Validation Loss: 0.009320342071851095
Epoch: 2, Training Loss: 0.009587648372990745, Validation Loss: 0.008643218803405761
Epoch: 3, Training Loss: 0.008896584039075034, Validation Loss: 0.008106801501909892
Epoch: 4, Training Loss: 0.008237763113634926, Validation Loss: 0.007815456879138946
Epoch: 5, Training Loss: 0.007501480575970241, Validation Loss: 0.007354682270685832
Epoch: 6, Training Loss: 0.006472509408848626, Validation Loss: 0.006976891314983368
Epoch: 7, Training Loss: 0.005445533606835774, Validation Loss: 0.006978283977508545
Epoch: 8, Training Loss: 0.004376371955871582, Validation Loss: 0.007098943030834198
Epoch: 9, Training Loss: 0.003514788212946483, Validation Loss: 0.007374733328819275
Epoch: 10, Training Loss: 0.0029060147268431527, Validation Loss: 0.0078299218416214
CPU times: user 1min 29s, sys: 37.7 s, total: 2min 7s
Wall time: 2min 7s


### Calculate performance of the trained model (2 points)

In [0]:
# Run the model over the test set and gather hard 0/1 predictions with the
# matching ground-truth labels. Per-batch arrays are collected in lists and
# concatenated once, instead of re-allocating with np.append on every batch.
pred_chunks = []
label_chunks = []

model.eval()  # disables dropout
with torch.no_grad():
    for batch in test_iter:
        preds = model(batch.text).squeeze(1)
        # Logit -> probability -> rounded class label.
        preds = torch.round(torch.sigmoid(preds))
        pred_chunks.append(preds.cpu().numpy())
        label_chunks.append(batch.label.cpu().numpy())

# Fall back to empty arrays when the iterator yielded nothing.
predictions = np.concatenate(pred_chunks) if pred_chunks else np.array([])
y_true = np.concatenate(label_chunks) if label_chunks else np.array([])

In [0]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
def metrics_report(true_y, preds_y):
    """Score predictions against ground truth.

    Args:
        true_y: ground-truth labels.
        preds_y: predicted labels, aligned with ``true_y``.

    Returns:
        Tuple ``(accuracy, precision, recall, f1)``.
    """
    scorers = (accuracy_score, precision_score, recall_score, f1_score)
    return tuple(score(true_y, preds_y) for score in scorers)

In [16]:
# Score the test-set predictions and print one metric per line.
accuracy, precision, recall, f1 = metrics_report(y_true, predictions)
for metric_name, metric_value in (
    ('Accuracy  ', accuracy),
    ('Precision ', precision),
    ('Recall    ', recall),
    ('F1        ', f1),
):
    print(metric_name, metric_value)

Accuracy   0.78924
Precision  0.8091492090637025
Recall     0.75704
F1         0.7822277330026866


Write down the calculated performance

- Accuracy: 0.78924
- Precision: 0.8091492090637025
- Recall: 0.75704
- F1: 0.7822277330026866

In [0]:
# Drop the reference to the current model and ask PyTorch to release its cached GPU memory.
del model
torch.cuda.empty_cache()

### Experiments (5 points)

Experiment with the model and achieve better results. Implement and describe your experiments in detail, and mention what was helpful.

In [0]:
class CNN(nn.Module):
    """CNN sentence classifier with one convolution branch per kernel size.

    Token ids are embedded, each branch convolves a window of ``k`` tokens
    spanning the full embedding width, the result is max-pooled over the
    sequence, the pooled features of all branches are concatenated, passed
    through dropout and mapped to a single logit.

    Args:
        V: vocabulary size (number of rows in the embedding table).
        D: embedding dimension.
        kernel_sizes: window heights, in tokens; one branch is built per
            entry (duplicates are allowed and give independent branches).
        dropout: dropout probability applied to the concatenated features.

    Raises:
        ValueError: if ``kernel_sizes`` is empty.
    """

    def __init__(self, V, D, kernel_sizes, dropout=0.5):
        super(CNN, self).__init__()

        if len(kernel_sizes) == 0:
            raise ValueError('kernel_sizes must contain at least one entry')

        n_filters = 5  # feature maps produced by each convolution branch

        # Use the constructor argument D for every width below; the layer sizes
        # must not depend on a notebook-level global.
        self.embedding = nn.Embedding(
            num_embeddings = V,
            embedding_dim = D
        )

        # One branch per requested kernel size, so the number of branches
        # always matches the input width of the final linear layer.
        self.convs = nn.ModuleList([
            nn.Conv2d(
                in_channels = 1,
                out_channels = n_filters,
                kernel_size = (k, D)
            )
            for k in kernel_sizes
        ])

        self.fc = nn.Linear(len(kernel_sizes) * n_filters, 1)
        self.dropout = nn.Dropout(dropout)

    def forward(self, x):
        """Return one logit per input row.

        Args:
            x: integer token ids, batch-first (one row per example). The
                sequence must be at least as long as the largest kernel size.

        Returns:
            Tensor with one column holding the raw (pre-sigmoid) score.
        """
        # Add a singleton channel axis so Conv2d (in_channels=1) accepts the input.
        embedded = self.embedding(x).unsqueeze(1)

        pooled = []
        for conv in self.convs:
            # The kernel spans the whole embedding width, so the last axis collapses to 1.
            feature_map = F.relu(conv(embedded).squeeze(3))
            # Max over all window positions: one value per filter.
            pooled.append(F.max_pool1d(feature_map, feature_map.shape[2]).squeeze(2))

        cat = self.dropout(torch.cat(pooled, dim=1))
        logit = self.fc(cat)
        return logit

In [0]:
# Experiment hyperparameters.
kernel_sizes = [3, 4, 4, 5]  # window heights (in tokens) of the convolution branches
vocab_size = len(TEXT.vocab)  # one embedding row per vocabulary entry
dropout = 0.2
dim = 256  # embedding dimension

# Place the model on the same device the iterators create batches on, so the
# notebook also runs on machines without CUDA.
model = CNN(vocab_size, dim, kernel_sizes, dropout).to(
    torch.device('cuda' if torch.cuda.is_available() else 'cpu')
)

In [0]:
# Adam over all model parameters with an explicit learning rate of 1e-3.
opt = optim.Adam(model.parameters(), lr=0.001) 
# Binary cross-entropy computed on raw logits (the sigmoid is applied inside the loss).
loss_func = nn.BCEWithLogitsLoss() 

In [0]:
# Fixed epoch budget: the training loop runs exactly this many epochs (no early stopping).
epochs = 5

In [45]:
%%time
for epoch in range(1, epochs + 1):
    running_loss = 0.0
    running_corrects = 0
    model.train() 
    
    for batch in train_iter:  
  
        opt.zero_grad()
        preds = model(batch.text).squeeze(1)

        loss = loss_func(preds, batch.label)
        loss.backward()
        opt.step()
        running_loss += loss.item()
        
    epoch_loss = running_loss / len(trn)
    
    val_loss = 0.0
    model.eval()
    correct = 0
    total = 0 
    for batch in val_iter:

        preds = model(batch.text).squeeze(1)
        loss = loss_func(preds, batch.label)
        val_loss += loss.item()
        
    val_loss /= len(vld)
    
    print('Epoch: {}, Training Loss: {}, Validation Loss: {}'.format(epoch, epoch_loss, val_loss))

Epoch: 1, Training Loss: 0.009676512670516968, Validation Loss: 0.008033390537897745
Epoch: 2, Training Loss: 0.007966688586984362, Validation Loss: 0.007282228211561839
Epoch: 3, Training Loss: 0.006897969649519239, Validation Loss: 0.006823840236663818
Epoch: 4, Training Loss: 0.005906678419453757, Validation Loss: 0.006308831483125687
Epoch: 5, Training Loss: 0.004730303029503141, Validation Loss: 0.006111744662125905
CPU times: user 44.4 s, sys: 17.7 s, total: 1min 2s
Wall time: 1min 2s


In [0]:
# Run the model over the test set and gather hard 0/1 predictions with the
# matching ground-truth labels. Per-batch arrays are collected in lists and
# concatenated once, instead of re-allocating with np.append on every batch.
pred_chunks = []
label_chunks = []

model.eval()  # disables dropout
with torch.no_grad():
    for batch in test_iter:
        preds = model(batch.text).squeeze(1)
        # Logit -> probability -> rounded class label.
        preds = torch.round(torch.sigmoid(preds))
        pred_chunks.append(preds.cpu().numpy())
        label_chunks.append(batch.label.cpu().numpy())

# Fall back to empty arrays when the iterator yielded nothing.
predictions = np.concatenate(pred_chunks) if pred_chunks else np.array([])
y_true = np.concatenate(label_chunks) if label_chunks else np.array([])

In [47]:
# Score the test-set predictions and print one metric per line.
accuracy, precision, recall, f1 = metrics_report(y_true, predictions)
for metric_name, metric_value in (
    ('Accuracy  ', accuracy),
    ('Precision ', precision),
    ('Recall    ', recall),
    ('F1        ', f1),
):
    print(metric_name, metric_value)

Accuracy   0.82148
Precision  0.8121649965043114
Recall     0.8364
F1         0.8241043629054506


### 1. Changed the kernel sizes to [3, 4, 4] and the embedding dimension to 256. Trained for 10 epochs.

Accuracy   0.80488

Precision  0.811152841280209

Recall     0.7948

F1         0.8028931630838857

### 2. Added a fourth convolution branch (kernel sizes [3, 4, 4, 5]) with embedding dimension 256 and dropout 0.2. Trained for 5 epochs.

Accuracy   0.82148

Precision  0.8121649965043114

Recall     0.8364

F1         0.8241043629054506