*Kirill Semenov, bkl162*

# Assignment 5

Build a CNN model for sentiment analysis (binary classification) of IMDB reviews (https://www.kaggle.com/utathya/imdb-review-dataset).
You may use the data with label="unsup" to pretrain embeddings, but you are forbidden to use the test dataset for pretraining embeddings.  
Your quality metric is the accuracy score on the test dataset. Look at the "type" column for the train/test split.  
You can use pretrained embeddings from external sources.  
You have to provide the results of trials with different hyperparameter values.  

You have to beat the following baselines:  
[3 points] acc = 0.75  
[5 points] acc = 0.8  
[8 points] acc = 0.9  

[2 points] for using unsupervised data  

## Importing libraries

In [0]:
import torch
from torchtext import data, datasets
import random
import numpy as np

import torch.nn as nn
import torch.nn.functional as F

import torch.optim as optim

# Single seed shared by every random number generator the notebook uses
# (Python's `random`, NumPy and PyTorch) so that runs are repeatable.
SEED = 42

random.seed(SEED)
np.random.seed(SEED)
torch.manual_seed(SEED)
# Restrict cuDNN to deterministic algorithms.
torch.backends.cudnn.deterministic = True



## Downloading the dataset (analogous to the Kaggle one, but loaded from torchtext.datasets)

In [0]:
# Reviews are tokenized with spaCy and batched with the batch dimension first;
# labels are stored as floats.
TEXT = data.Field(tokenize = 'spacy', batch_first = True)
LABEL = data.LabelField(dtype = torch.float)

train_data, test_data = datasets.IMDB.splits(TEXT, LABEL)

# Carve a validation set out of the training data (torchtext's default split
# ratio). `random.seed()` returns None, so passing its result as `random_state`
# hands the splitter no state at all; seed first and pass the resulting
# generator state explicitly so the split is reproducible by construction.
random.seed(SEED)
train_data, valid_data = train_data.split(random_state = random.getstate())

## Downloading pretrained vectors

In [0]:
# Upper bound on the number of tokens kept in the text vocabulary.
MAX_VOCAB_SIZE = 30000

# Build the vocabulary from the training split only (validation and test
# data are not used) and attach pretrained 100-dimensional GloVe vectors.
# `unk_init` supplies the initializer for tokens that have no pretrained
# vector; here they are drawn from a normal distribution.
TEXT.build_vocab(train_data, max_size = MAX_VOCAB_SIZE, 
                 vectors = "glove.6B.100d", 
                 unk_init = torch.Tensor.normal_)

# The label vocabulary is likewise built from the training split.
LABEL.build_vocab(train_data)

In [0]:
# Number of examples per batch, shared by all three iterators.
BATCH_SIZE = 64

# Run on the GPU when PyTorch can see one, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')

# One bucketed iterator per split; batches are created on `device`.
train_iterator, valid_iterator, test_iterator = data.BucketIterator.splits(
    (train_data, valid_data, test_data),
    batch_size = BATCH_SIZE,
    device = device,
)

## Making model

In [0]:
class CNN(nn.Module):
    """Convolutional text classifier over word embeddings.

    Each token index is embedded, one convolution per entry of
    `filter_sizes` slides over the token axis (each kernel spans the full
    embedding width), every feature map is ReLU-activated and max-pooled
    over time, and the concatenated pooled features go through dropout and
    a final linear layer.

    Args:
        vocab_size: number of rows of the embedding table.
        embedding_dim: width of each embedding vector.
        n_filters: output channels of every convolution.
        filter_sizes: iterable of kernel heights (tokens covered by a
            filter). Any non-empty length is accepted; one convolution is
            created per entry and registered as `conv_layer_<i>`.
        output_dim: number of outputs of the final linear layer.
        dropout: dropout probability applied to the pooled features.
        pad_idx: index of the padding token in the embedding table.

    Raises:
        ValueError: if `filter_sizes` is empty.
    """

    def __init__(self, vocab_size, embedding_dim, n_filters, filter_sizes, output_dim, 
                 dropout, pad_idx):
        
        super().__init__()
        
        # Materialize once so generators/tuples are accepted and len() is safe.
        filter_sizes = list(filter_sizes)
        if not filter_sizes:
            raise ValueError("filter_sizes must contain at least one kernel height")
        
        self.embedding = nn.Embedding(vocab_size, embedding_dim, padding_idx = pad_idx)
        
        # One convolution per filter size. They are registered under the
        # names conv_layer_0, conv_layer_1, ... so that attribute access and
        # state-dict keys stay the same as with three hand-written layers.
        self._conv_names = []
        for index, height in enumerate(filter_sizes):
            layer_name = f"conv_layer_{index}"
            setattr(self, layer_name,
                    nn.Conv2d(in_channels = 1, out_channels = n_filters,
                              kernel_size = (height, embedding_dim)))
            self._conv_names.append(layer_name)
        
        # Input width matches the number of convolutions actually created.
        self.fc = nn.Linear(len(filter_sizes) * n_filters, output_dim)
        
        self.dropout = nn.Dropout(dropout)

        
    def forward(self, text):
        """Return raw (pre-sigmoid) scores of shape (batch, output_dim).

        `text` is a (batch, seq_len) tensor of token indices; seq_len must
        be at least as large as the biggest filter size, otherwise the
        convolution cannot be applied.
        """
        
        # (batch, seq_len, emb) -> (batch, 1, seq_len, emb): add the single
        # input channel expected by Conv2d.
        embedded = self.embedding(text).unsqueeze(1)
        
        pooled = []
        for layer_name in self._conv_names:
            conv = getattr(self, layer_name)
            # The kernel spans the whole embedding width, so the last axis
            # collapses to size 1 and is dropped: (batch, n_filters, positions).
            feature_map = F.relu(conv(embedded).squeeze(3))
            # Max over all positions -> (batch, n_filters).
            pooled.append(F.max_pool1d(feature_map, feature_map.shape[2]).squeeze(2))
        
        cat = self.dropout(torch.cat(pooled, dim = 1))
        
        return self.fc(cat)

## Trying hyperparams

In [0]:
# Sizes derived from the vocabulary built earlier.
INPUT_DIM = len(TEXT.vocab)
PAD_IDX = TEXT.vocab.stoi[TEXT.pad_token]

# Hyperparameters of this trial. EMBEDDING_DIM has to match the width of
# the pretrained "glove.6B.100d" vectors that are copied into the model.
EMBEDDING_DIM = 100
N_FILTERS = 100
FILTER_SIZES = [3,4,5]
OUTPUT_DIM = 1
DROPOUT = 0.25

model = CNN(
    vocab_size = INPUT_DIM,
    embedding_dim = EMBEDDING_DIM,
    n_filters = N_FILTERS,
    filter_sizes = FILTER_SIZES,
    output_dim = OUTPUT_DIM,
    dropout = DROPOUT,
    pad_idx = PAD_IDX,
)

In [0]:

# Vectors attached to the text vocabulary when it was built with
# `vectors = "glove.6B.100d"`.
pretrained_embeddings = TEXT.vocab.vectors

# Overwrite the embedding layer's initial weights in place with the
# pretrained vectors. As the last expression of the cell, the updated
# weight tensor is also what the notebook displays.
model.embedding.weight.data.copy_(pretrained_embeddings)

tensor([[ 1.9269,  1.4873,  0.9007,  ...,  0.1233,  0.3499,  0.6173],
        [ 0.7262,  0.0912, -0.3891,  ...,  0.0821,  0.4440, -0.7240],
        [-0.0382, -0.2449,  0.7281,  ..., -0.1459,  0.8278,  0.2706],
        ...,
        [-2.2970,  1.1119,  0.7589,  ...,  0.1428, -0.0703, -1.2786],
        [-0.5917, -0.0553,  0.9989,  ..., -0.3378, -0.2986, -1.0236],
        [-0.4220, -1.6393,  0.1731,  ..., -0.6083, -0.4398,  1.9571]])

In [0]:
UNK_IDX = TEXT.vocab.stoi[TEXT.unk_token]

# Reset the embedding rows of the two special tokens (<unk> and <pad>) to
# all zeros in a single indexed assignment; the zero vector is broadcast
# to both rows.
model.embedding.weight.data[[UNK_IDX, PAD_IDX]] = torch.zeros(EMBEDDING_DIM)

## Setting up the optimizer, loss and accuracy metric

In [0]:
# Move the model to the target device BEFORE building the optimizer: the
# PyTorch optimizer documentation asks for parameters to already live in
# their final location when the optimizer is constructed.
model = model.to(device)

# Adam with its default hyperparameters over all model parameters.
optimizer = optim.Adam(model.parameters())

# Binary cross-entropy on raw logits (the sigmoid is applied inside the loss).
criterion = nn.BCEWithLogitsLoss()
criterion = criterion.to(device)

In [0]:
def bin_acc(preds, y): #binary_accuracy
    """Return the fraction of predictions that match the labels.

    `preds` holds raw logits; they are squashed with a sigmoid and rounded
    to hard 0/1 labels before being compared element-wise with `y`.
    The result is a 0-dim tensor: matches divided by the length of the
    first dimension of the comparison.
    """
    hard_labels = torch.sigmoid(preds).round()
    hits = hard_labels.eq(y).float()

    return hits.sum() / len(hits)

## Making `train` and `evaluation` functions

In [0]:
def train(model, iterator, optimizer, criterion):
    """Run one training epoch and return `(mean_loss, mean_accuracy)`.

    For every batch the gradients are reset, the model is run on
    `batch.text`, the loss against `batch.label` is back-propagated and the
    optimizer takes one step. The model is put in training mode, so the
    reported accuracy is measured with dropout active and while the weights
    are still changing.

    Both returned values are averages per example: each batch contributes
    in proportion to its size, so a smaller final batch is not over-weighted.
    This assumes `criterion` returns the mean loss of the batch (the default
    reduction of `nn.BCEWithLogitsLoss`).

    Raises:
        ValueError: if `iterator` yields no examples.
    """
    
    total_loss, total_acc, n_examples = 0.0, 0.0, 0
    
    model.train()
    
    for batch in iterator:
        
        optimizer.zero_grad()
        
        # The model emits (batch, 1); drop the trailing axis to match the labels.
        predictions = model(batch.text).squeeze(1)
        
        loss = criterion(predictions, batch.label)
        
        acc = bin_acc(predictions, batch.label)
        
        loss.backward()
        
        optimizer.step()
        
        # Turn the per-batch means back into sums so the epoch average is
        # taken over examples rather than over batches.
        batch_size = len(batch.label)
        total_loss += loss.item() * batch_size
        total_acc += acc.item() * batch_size
        n_examples += batch_size
        
    if n_examples == 0:
        raise ValueError("iterator yielded no examples")
        
    return total_loss / n_examples, total_acc / n_examples


def evaluate(model, iterator, criterion):
    """Score `model` on `iterator` and return `(mean_loss, mean_accuracy)`.

    The model is switched to evaluation mode and gradients are disabled;
    no parameters are updated.

    Both returned values are averages per example: each batch contributes
    in proportion to its size, so the reported accuracy is the accuracy
    over the whole dataset even when the last batch is smaller. This
    assumes `criterion` returns the mean loss of the batch (the default
    reduction of `nn.BCEWithLogitsLoss`).

    Raises:
        ValueError: if `iterator` yields no examples.
    """
    
    total_loss, total_acc, n_examples = 0.0, 0.0, 0
    
    model.eval()
    
    with torch.no_grad():
    
        for batch in iterator:

            # The model emits (batch, 1); drop the trailing axis to match the labels.
            predictions = model(batch.text).squeeze(1)
            
            loss = criterion(predictions, batch.label)
            
            acc = bin_acc(predictions, batch.label)

            # Turn the per-batch means back into sums so the final average
            # is taken over examples rather than over batches.
            batch_size = len(batch.label)
            total_loss += loss.item() * batch_size
            total_acc += acc.item() * batch_size
            n_examples += batch_size
        
    if n_examples == 0:
        raise ValueError("iterator yielded no examples")
        
    return total_loss / n_examples, total_acc / n_examples

## Learning

In [0]:
N_EPOCHS = 5

# Lowest validation loss seen so far; starting at +inf means any finite
# first-epoch loss triggers a checkpoint.
best_valid_loss = float('inf')

for epoch in range(N_EPOCHS):

    train_loss, train_acc = train(model, train_iterator, optimizer, criterion)
    valid_loss, valid_acc = evaluate(model, valid_iterator, criterion)

    # Keep on disk only the weights of the epoch with the best validation loss.
    if best_valid_loss > valid_loss:
        best_valid_loss = valid_loss
        torch.save(model.state_dict(), 'tut4-model.pt')

    # Per-epoch report: epoch number, then training and validation metrics.
    print(
        f'Epoch: {epoch+1:02}',
        f'\tTrain Loss: {train_loss:.3f} | Train Acc: {train_acc*100:.2f}%',
        f'\t Val. Loss: {valid_loss:.3f} |  Val. Acc: {valid_acc*100:.2f}%',
        sep = '\n',
    )

Epoch: 01
	Train Loss: 0.606 | Train Acc: 65.93%
	 Val. Loss: 0.434 |  Val. Acc: 80.91%
Epoch: 02
	Train Loss: 0.358 | Train Acc: 84.34%
	 Val. Loss: 0.328 |  Val. Acc: 85.73%
Epoch: 03
	Train Loss: 0.232 | Train Acc: 90.90%
	 Val. Loss: 0.297 |  Val. Acc: 87.09%
Epoch: 04
	Train Loss: 0.151 | Train Acc: 94.74%
	 Val. Loss: 0.299 |  Val. Acc: 87.32%
Epoch: 05
	Train Loss: 0.089 | Train Acc: 97.17%
	 Val. Loss: 0.322 |  Val. Acc: 87.11%


## Testing

In [0]:
# Restore the weights saved at the epoch with the lowest validation loss.
# `map_location` remaps the stored tensors onto the current device, so a
# checkpoint written on a GPU can still be loaded on a CPU-only machine.
model.load_state_dict(torch.load('tut4-model.pt', map_location = device))

# Final score on the held-out test split.
test_loss, test_acc = evaluate(model, test_iterator, criterion)

print(f'Test Loss: {test_loss:.3f} | Test Acc: {test_acc*100:.2f}%')

Test Loss: 0.365 | Test Acc: 85.75%
