### Important Note: Make sure to Restart and Run all (Kernel -> Restart and Run all) every time you modify your network before training it: Jupyter Notebook saves network weight and resumes training instead of starting it from scratch again

In [None]:
# First lets improve libraries that we are going to be used in this lab session
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import Dataset
from collections import Counter
import pickle as pkl
import random
import pdb
random.seed(134)
from wordfreq import top_n_list
num_words = 50000

BATCH_SIZE = 2

In [None]:
PAD_IDX = 0
UNK_IDX = 1

words_to_load = 50000

with open('wiki-news-300d-1M.vec') as f:
    loaded_embeddings_ft = np.zeros((words_to_load, 300))
    words_ft = {}
    idx2words_ft = {}
    ordered_words_ft = []
    for i, line in enumerate(f):
        if i+2 >= words_to_load: 
            break
        s = line.split()
        loaded_embeddings_ft[i+2, :] = np.asarray(s[1:])
        words_ft[s[0]] = i + 2
        idx2words_ft[i + 2] = s[0]
        ordered_words_ft.append(s[0])
# loaded_embeddings_ft is a list of lists, mapping each word index to its corresponding vector
# words_ft: dict, maps each word to its corresponding index, including that os <unk> and <pad>
# idx2words_ft: dict, maps each index to the word

    idx2words_ft[PAD_IDX] = '<pad>'
    idx2words_ft[UNK_IDX] = '<unk>'
    words_ft['<pad>'] = PAD_IDX
    words_ft['<unk>'] = UNK_IDX

In [None]:
loaded_embeddings_ft[1,:] = np.random.normal(size = (300,))

In [None]:
# Check if loading was correct
# print(type(loaded_embeddings_ft))
# print (UNK_IDX)
# print(loaded_embeddings_ft[words_ft['potato']])
print(loaded_embeddings_ft[words_ft['<unk>']])

In [None]:
import pandas as pd
snli_train = pd.read_csv("hw2_data/snli_train.tsv", sep='\t')
snli_val = pd.read_csv("hw2_data/snli_val.tsv", sep='\t')

train_targets = snli_train.label
val_targets = snli_val.label
# Check structure of the dataframe
snli_train.head()

### Now lets build the PyTorch DataLoader as we did in previous lab

In [9]:
# convert token to id in the dataset
def token2index_dataset(tokens_data):
    indices_data = []
    for i in range(len(tokens_data)):
        temp = []
        for tokens in tokens_data[i].split():
            if tokens in words_ft:
                temp.append(words_ft[tokens])
            else:
                temp.append(UNK_IDX)
                        
        indices_data.append(temp)
    return indices_data

train_prem_data_indices = token2index_dataset(snli_train.sentence1)
train_hypo_data_indices = token2index_dataset(snli_train.sentence2)
val_prem_data_indices = token2index_dataset(snli_val.sentence1)
val_hypo_data_indices = token2index_dataset(snli_val.sentence2)

# double checking
print ("Train premise dataset size is {}".format(len(train_prem_data_indices)))
print ("Train hypothesis dataset size is {}".format(len(train_hypo_data_indices)))
print ("Val dataset size is {}".format(len(val_prem_data_indices)))



Train premise dataset size is 100000
Train hypothesis dataset size is 100000
Val dataset size is 1000


In [10]:
train_data_indices = [[train_prem_data_indices[i], train_hypo_data_indices[i]] for i in range(len(train_prem_data_indices))]

val_data_indices = [[val_prem_data_indices[i], val_hypo_data_indices[i]] for i in range(len(val_hypo_data_indices))]



In [11]:
print(train_prem_data_indices[0])
for index in train_prem_data_indices[0]:
    print(idx2words_ft[index])

[107, 803, 1831, 9, 10, 6266, 7168, 4389, 18, 10, 12230, 5336, 10, 564, 7, 359, 5]
A
young
girl
in
a
pink
shirt
sitting
on
a
dock
viewing
a
body
of
water
.


In [12]:
MAX_SENTENCE_LENGTH = 100

import numpy as np
import torch
from torch.utils.data import Dataset

class NewsGroupDataset(Dataset):
    """
    Class that represents a train/validation/test dataset that's readable for PyTorch
    Note that this class inherits torch.utils.data.Dataset
    """
    
    def __init__(self, data_list, target_list):
        """
        @param data_list: list of newsgroup tokens 
        @param target_list: list of newsgroup targets 

        """
        self.data_list = data_list
        self.target_list = target_list
        assert (len(self.data_list) == len(self.target_list))

    def __len__(self):
        return len(self.data_list)
        
    def __getitem__(self, key):
        """
        Triggered when you call dataset[i]
        """
        
        token_idx = self.data_list[key][:MAX_SENTENCE_LENGTH]
        label = self.target_list[key]
        return [token_idx, len(token_idx), label]

def newsgroup_collate_func(batch):
    """
    Customized function for DataLoader that dynamically pads the batch so that all 
    data have the same length
    """
    s1_list = []
    s2_list = []
    label_list = []
    s1_length_list = []
    s2_length_list = []
    
    #print("collate batch: ", batch[0][0])
    #batch[0][0] = batch[0][0][:MAX_SENTENCE_LENGTH]
    for datum in batch:
        if datum[2] == "entailment":
            label_list.append(1)
        if datum[2] == "neutral":
            label_list.append(2)
        if datum[2] == "contradiction":
            label_list.append(3)
#         label_list.append(datum[2])
        s1_length_list.append(len(datum[0][0]))
        s2_length_list.append(len(datum[0][1]))

    # padding
    for datum in batch:
        s1 = len(datum[0][0])
        padded_vec = np.pad(np.array(datum[0][0]), 
                                pad_width=((0,MAX_SENTENCE_LENGTH-s1)), 
                                mode="constant", constant_values=0)
        s1_list.append(padded_vec)
#         print(datum)
    for datum in batch:
        s2 = len(datum[0][1])
        padded_vec = np.pad(np.array(datum[0][1]), 
                                pad_width=((0,MAX_SENTENCE_LENGTH-s2)), 
                                mode="constant", constant_values=0)
        s2_list.append(padded_vec)
#         print(s2_list)


    return [torch.from_numpy(np.array(s1_list)), torch.from_numpy(np.array(s2_list)), 
            torch.LongTensor(s1_length_list), torch.LongTensor(s2_length_list), torch.LongTensor(label_list)]

# create pytorch dataloader
#train_loader = NewsGroupDataset(train_data_indices, train_targets)
#val_loader = NewsGroupDataset(val_data_indices, val_targets)
#test_loader = NewsGroupDataset(test_data_indices, test_targets)

BATCH_SIZE = 32
train_dataset = NewsGroupDataset(train_data_indices, train_targets)
train_loader = torch.utils.data.DataLoader(dataset=train_dataset, 
                                           batch_size=BATCH_SIZE,
                                           collate_fn=newsgroup_collate_func,
                                           shuffle=True)

val_dataset = NewsGroupDataset(val_data_indices, val_targets)
val_loader = torch.utils.data.DataLoader(dataset=val_dataset, 
                                           batch_size=BATCH_SIZE,
                                           collate_fn=newsgroup_collate_func,
                                           shuffle=False)


for i, (s1, s2, s1_lengths, s2_lengths, labels) in enumerate(train_loader):
    print (s1)
    print (s1_lengths)
    print (labels)
    break

tensor([[  107,   348,  3303,  ...,     0,     0,     0],
        [  107,   364,     9,  ...,     0,     0,     0],
        [ 1897,  4389,    18,  ...,     0,     0,     0],
        ...,
        [ 1084,  1839, 12664,  ...,     0,     0,     0],
        [  107,  6843,  2372,  ...,     0,     0,     0],
        [   23,   995,     9,  ...,     0,     0,     0]])
tensor([24, 28, 13, 10, 15, 14, 14, 16, 13, 10, 14, 12,  8, 10, 25, 11, 18,  8,
         7, 14,  7,  7, 10, 14,  7, 13, 12, 11, 23, 14, 10, 12])
tensor([3, 3, 2, 2, 3, 1, 2, 2, 3, 1, 2, 1, 3, 1, 2, 3, 3, 3, 1, 1, 3, 2, 2, 1,
        3, 1, 3, 2, 1, 2, 3, 2])


### Recurrent Neural Net model

In [11]:
class RNN(nn.Module):
    def __init__(self, emb_size, hidden_size, num_layers, num_classes, vocab_size):
        # RNN Accepts the following hyperparams:
        # emb_size: Embedding Size
        # hidden_size: Hidden Size of layer in RNN
        # num_layers: number of layers in RNN
        # num_classes: number of output classes
        # vocab_size: vocabulary size
        super(RNN, self).__init__()

        self.num_layers, self.hidden_size = num_layers, hidden_size
        self.embedding = nn.Embedding(vocab_size, emb_size, padding_idx=PAD_IDX)
        self.rnn = nn.GRU(emb_size, hidden_size, num_layers, batch_first=True, bidirectional=True)
        self.linear = nn.Linear(4*hidden_size, num_classes) ## times 4

    def init_hidden(self, batch_size):
        # Function initializes the activation of recurrent neural net at timestep 0
        # Needs to be in format (num_layers, batch_size, hidden_size)
        hidden = torch.randn(self.num_layers*2, batch_size, self.hidden_size) ## time 2

        return hidden

    def forward(self, s1, s2, s1_lengths, s2_lengths):
        # reset hidden state
        

        s1desc_idx = np.argsort(np.array(s1_lengths))[::-1]
        s2desc_idx = np.argsort(np.array(s2_lengths))[::-1]
        
#         print(np.array(s1_lengths)[desc_idx])
        reordered_s1_lengths = (np.array(s1_lengths)[s1desc_idx])
        reordered_s2_lengths = (np.array(s2_lengths)[s2desc_idx])
        
        s1order = (np.linspace(0, len(s1_lengths), len(s1_lengths), endpoint = False)).astype('int')
        s2order = (np.linspace(0, len(s2_lengths), len(s2_lengths), endpoint = False)).astype('int')
        
        s1_batch_size, s1_seq_len = s1.size()
        s2_batch_size, s2_seq_len = s2.size()

        self.hidden_s1 = self.init_hidden(s1_batch_size)
        self.hidden_s2 = self.init_hidden(s2_batch_size)
        
        # get embedding of characters
        s1_embed = self.embedding(s1)
        s2_embed = self.embedding(s2)
        
        # pack padded sequence
        s1_embed = torch.nn.utils.rnn.pack_padded_sequence(s1_embed, reordered_s1_lengths, batch_first=True)
        s2_embed = torch.nn.utils.rnn.pack_padded_sequence(s2_embed, reordered_s2_lengths, batch_first=True)
        # fprop though RNN
        
        s1reorder = (s1order[s1desc_idx])
        s2reorder = (s2order[s2desc_idx])
        
        reversed1 = np.argsort(s1reorder)
        reversed2 = np.argsort(s2reorder)
        
        
        s1_rnn_out, self.hidden_s1 = self.rnn(s1_embed, self.hidden_s1)
        s2_rnn_out, self.hidden_s2 = self.rnn(s2_embed, self.hidden_s2)
        
        s1_rnn_out, _ = torch.nn.utils.rnn.pad_packed_sequence(s1_rnn_out, batch_first=True)
        s2_rnn_out, _ = torch.nn.utils.rnn.pad_packed_sequence(s2_rnn_out, batch_first=True)
        # sum hidden activations of RNN across time
        
        #unsort it back
        s1_rnn_out = s1_rnn_out[reversed1]
        s2_rnn_out = s2_rnn_out[reversed2]
        
        s1_rnn_out = torch.sum(s1_rnn_out, dim=1)
        s2_rnn_out = torch.sum(s2_rnn_out, dim=1)
        
        
        rnn_out = torch.cat([s1_rnn_out, s2_rnn_out], 1)
        #relu!!!
        rnn_out = self.linear(rnn_out)
        
#         rnn_out, _ = torch.nn.utils.rnn.pad_packed_sequence(rnn_out, batch_first=True)
        # sum hidden activations of RNN across time
#         rnn_out = torch.sum(rnn_out, dim=1)

        return rnn_out


In [66]:
np.argsort(-s1_lengths)

tensor([18, 30,  0,  9, 20, 31,  4, 11, 24, 12, 14, 15, 27, 25, 23,  1, 19, 16,
        28, 26,  3, 22, 29, 21,  7, 13, 17,  2, 10,  8,  5,  6])

In [77]:
print(s1_lengths)
order = np.linspace(0, len(s1_lengths), len(s1_lengths), endpoint = False)
sort = (np.argsort(s1_lengths))
a = (s1_lengths[np.argsort(s1_lengths)])
print(a)
print(order[sort])
print(a[order[sort]])

tensor([24, 13,  8, 12, 17,  8,  7, 10,  8, 22,  8, 17, 15,  9, 15, 14, 13,  9,
        28, 13, 19, 10, 11, 13, 16, 14, 13, 14, 13, 11, 26, 18])
tensor([ 7,  8,  8,  8,  8,  9,  9, 10, 10, 11, 11, 12, 13, 13, 13, 13, 13, 13,
        14, 14, 14, 15, 15, 16, 17, 17, 18, 19, 22, 24, 26, 28])
[ 6.  2.  5.  8. 10. 17. 13.  7. 21. 29. 22.  3. 16. 28. 19. 23.  1. 26.
 15. 25. 27. 14. 12. 24. 11.  4. 31. 20.  9.  0. 30. 18.]
tensor([ 9,  8,  9, 10, 11, 13, 13, 10, 15, 24, 15,  8, 13, 22, 14, 16,  8, 18,
        13, 17, 19, 13, 13, 17, 12,  8, 28, 14, 11,  7, 26, 14])


In [125]:
print(np.array(s1_lengths))
print()
desc_idx = np.argsort(np.array(s1_lengths))[::-1]

print(np.array(s1_lengths)[desc_idx])
print()
sorted_list = (np.array(s1_lengths)[desc_idx])
order = (np.linspace(0, len(s1_lengths), len(s1_lengths), endpoint = False)).astype('int')

print(order)
print()
# reverse1 = np.argsort(sort)

reorder = (order[desc_idx])
print(reorder)
print()

reversed1 = np.argsort(reorder)
print(reversed1)
print()
print(sorted_list[reversed1])
# sort[reverse1]

[11 11 14 13 13 26 10 11  9 18 11  8 23 14 10 24 13 15  8 19  8 14 17 10
 10 14 15  8 13 13 17 10]

[26 24 23 19 18 17 17 15 15 14 14 14 14 13 13 13 13 13 11 11 11 11 10 10
 10 10 10  9  8  8  8  8]

[ 0  1  2  3  4  5  6  7  8  9 10 11 12 13 14 15 16 17 18 19 20 21 22 23
 24 25 26 27 28 29 30 31]

[ 5 15 12 19  9 22 30 26 17 25  2 13 21  3  4 28 16 29 10  1  7  0 14  6
 23 24 31  8 11 18 20 27]

[21 19 10 13 14  0 23 20 27  4 18 28  2 11 22  1 16  8 29  3 30 12  5 24
 25  9  7 31 15 17  6 26]

[11 11 14 13 13 26 10 11  9 18 11  8 23 14 10 24 13 15  8 19  8 14 17 10
 10 14 15  8 13 13 17 10]


## Important things to keep in mind when using variable sized sequences in RNN in Pytorch

### RNN modules accept packed sequences as inputs
* pack_padded_sequence function packs a sequence (in Tensor format) containing padded sequences of variable length. **IMPORTANT: the sequences should be sorted by length in a decreasing order before passing to this function**

* pad_packed_sequence function is an inverse operation to pack_padded_sequence. Transforms a padded sequence into a tensor of variable lenth sequences

In [12]:
def test_model(loader, model):
    """
    Help function that tests the model's performance on a dataset
    @param: loader - data loader for the dataset to test against
    """
    correct = 0
    total = 0
    model.eval()
    for s1, s2, s1_lengths, s2_lengths, labels in loader:
        s1_batch, s2_batch, s1_lengths_batch, s2_lengths_batch, labels_batch = s1, s2, s1_lengths, s2_lengths, labels
        outputs = F.softmax(model(s1_batch, s2_batch, s1_lengths_batch, s2_lengths_batch), dim=1)
        predicted = outputs.max(1, keepdim=True)[1]

        total += labels.size(0)
        correct += predicted.eq(labels.view_as(predicted)).sum().item()
    return (100 * correct / total)


model = RNN(emb_size=100, hidden_size=200, num_layers=1, num_classes=3, vocab_size=len(idx2words_ft))

learning_rate = 3e-4
num_epochs = 10 # number epoch to train

# Criterion and Optimizer
criterion = torch.nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)

# Train the model
total_step = len(train_loader)

for epoch in range(num_epochs):
    for i, (s1, s2, s1_lengths, s2_lengths, labels) in enumerate(train_loader):
        model.train()
        optimizer.zero_grad()
        # Forward pass
        outputs = model(s1, s2, s1_lengths, s2_lengths)
        loss = criterion(outputs, labels)

        # Backward and optimize
        loss.backward()
        optimizer.step()
        # validate every 100 iterations
        if i > 0 and i % 100 == 0:
            # validate
            val_acc = test_model(val_loader, model)
            print('Epoch: [{}/{}], Step: [{}/{}], Validation Acc: {}'.format(
                       epoch+1, num_epochs, i+1, len(train_loader), val_acc))


RuntimeError: Assertion `cur_target >= 0 && cur_target < n_classes' failed.  at /Users/soumith/miniconda2/conda-bld/pytorch_1532623076075/work/aten/src/THNN/generic/ClassNLLCriterion.c:93

## Exercise 1:
### Implement LSTM cell instead of RNN cell. Train the model and compare the results.
### Hint (modify init_hidden function and cell in __init__) 

## Exercise 2:
### Implement Bidirectional LSTM. You can do it very easily by adding one argument to cell when you create it.
### For better understanding we recommend that you implement it youself by reversing a sequence and passing it to another cell.

## Exercise 3:

### Add max-pooling (over time) after passing through RNN instead of summing over hidden layers through time

### Now lets implement basic Convolutional Neural Net model for text


In [13]:
class CNN(nn.Module):
    def __init__(self, emb_size, hidden_size, num_layers, num_classes, vocab_size):

        super(CNN, self).__init__()

        self.num_layers, self.hidden_size = num_layers, hidden_size
        self.embedding = nn.Embedding(vocab_size, emb_size, padding_idx=PAD_IDX)
    
        self.conv1 = nn.Conv1d(emb_size, hidden_size, kernel_size=3, padding=1)
        self.conv2 = nn.Conv1d(hidden_size, hidden_size, kernel_size=3, padding=1)

        self.linear = nn.Linear(hidden_size, num_classes)

    def forward(self, s1, s2, s1_lengths, s2_lengths):
        s1_batch_size, s1_seq_len = s1.size()
        s2_batch_size, s2_seq_len = s2.size()

        s1_embed = self.embedding(s1)
        s2_embed = self.embedding(s2)
        
        s1_hidden = self.conv1(s1_embed.transpose(1,2)).transpose(1,2)
        s2_hidden = self.conv1(s2_embed.transpose(1,2)).transpose(1,2)
        
        s1_hidden = F.relu(s1_hidden.contiguous().view(-1, s1_hidden.size(-1))).view(s1_batch_size, s1_seq_len, s1_hidden.size(-1))
        s2_hidden = F.relu(s2_hidden.contiguous().view(-1, s2_hidden.size(-1))).view(s2_batch_size, s2_seq_len, s1_hidden.size(-1))
        
        s1_hidden = self.conv2(s1_hidden.transpose(1,2)).transpose(1,2)
        s2_hidden = self.conv2(s2_hidden.transpose(1,2)).transpose(1,2)
        
        s1_hidden = F.relu(s1_hidden.contiguous().view(-1, s1_hidden.size(-1))).view(s1_batch_size, s2_seq_len, s1_hidden.size(-1))
        s2_hidden = F.relu(s2_hidden.contiguous().view(-1, s2_hidden.size(-1))).view(s2_batch_size, s2_seq_len, s2_hidden.size(-1))
        
        s1_pool = nn.MaxPool1d(kernel_size = 3)
        s2_pool = nn.MaxPool1d(kernel_size = 3)
        
        s1_hidden = s1_pool(s1_hidden)
        s2_hidden = s2_pool(s2_hidden)
        
        hidden = torch.cat([s1_hidden, s2_hidden], 1)
        
        # Replace F.relu with gated activation
        
        hidden = torch.sum(hidden, dim=1)
#         logits = self.linear(hidden)
        return hidden

## Important things to keep in mind when using Convolutional Nets for Language Tasks in Pytorch

### Conv1d module expect input of size (batch_size, num_channels, length), where in our case input has size (batch_size, length, num_channels). Hence it is important call transpose(1,2) before passing it to convolutional layer and then reshape it back to (batch_size, length, num_channels) by calling transpose(1,2) again

### Additionally we need to reshape hidden activations into 2D tensor before passing it to Relu layer by calling view(-1, hidden.size(-1)

In [14]:
default = []
def test_model(loader, model):
    """
    Help function that tests the model's performance on a dataset
    @param: loader - data loader for the dataset to test against
    """
    correct = 0
    total = 0
    model.eval()
    for s1, s2, s1_lengths, s2_lengths, labels in loader:
        s1_batch, s2_batch, s1_lengths_batch, s2_lengths_batch, label_batch = s1, s2, s1_lengths, s2_lengths, labels
        outputs = F.softmax(model(s1_batch, s2_batch, s1_lengths_batch, s2_lengths_batch), dim=1)
        predicted = outputs.max(1, keepdim=True)[1]

        total += labels.size(0)
        correct += predicted.eq(labels.view_as(predicted)).sum().item()
    return (100 * correct / total)


model = CNN(emb_size=100, hidden_size=200, num_layers=2, num_classes=3, vocab_size=len(idx2words_ft))

learning_rate = 3e-4
num_epochs = 10 # number epoch to train

# Criterion and Optimizer
criterion = torch.nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)

# Train the model
total_step = len(train_loader)

for epoch in range(num_epochs):
    for i, (s1, s2, s1_lengths, s2_lengths, labels) in enumerate(train_loader):
        model.train()
        optimizer.zero_grad()
        # Forward pass
        outputs = model(s1, s2, s1_lengths, s2_lengths)
        loss = criterion(outputs, labels)

        # Backward and optimize
        loss.backward()
        optimizer.step()
        # validate every 100 iterations
        if i > 0 and i % 100 == 0:
            # validate
            val_acc = test_model(val_loader, model)
            train_acc = test_model(train_loader, model)
            default.append([val_acc, train_acc])
            print('Epoch: [{}/{}], Step: [{}/{}], Validation Acc: {}'.format(
                       epoch+1, num_epochs, i+1, len(train_loader), val_acc))


Epoch: [1/10], Step: [101/3125], Validation Acc: 37.1
Epoch: [1/10], Step: [201/3125], Validation Acc: 41.7
Epoch: [1/10], Step: [301/3125], Validation Acc: 41.4
Epoch: [1/10], Step: [401/3125], Validation Acc: 38.5
Epoch: [1/10], Step: [501/3125], Validation Acc: 39.3
Epoch: [1/10], Step: [601/3125], Validation Acc: 41.9
Epoch: [1/10], Step: [701/3125], Validation Acc: 44.7
Epoch: [1/10], Step: [801/3125], Validation Acc: 45.3
Epoch: [1/10], Step: [901/3125], Validation Acc: 41.5
Epoch: [1/10], Step: [1001/3125], Validation Acc: 42.5
Epoch: [1/10], Step: [1101/3125], Validation Acc: 44.6
Epoch: [1/10], Step: [1201/3125], Validation Acc: 41.9
Epoch: [1/10], Step: [1301/3125], Validation Acc: 42.9
Epoch: [1/10], Step: [1401/3125], Validation Acc: 45.7
Epoch: [1/10], Step: [1501/3125], Validation Acc: 44.9
Epoch: [1/10], Step: [1601/3125], Validation Acc: 44.4
Epoch: [1/10], Step: [1701/3125], Validation Acc: 44.3
Epoch: [1/10], Step: [1801/3125], Validation Acc: 45.9
Epoch: [1/10], Step

KeyboardInterrupt: 

In [15]:
print(default)

[[37.1, 37.65], [41.7, 39.983], [41.4, 41.766], [38.5, 41.676], [39.3, 42.128], [41.9, 43.708], [44.7, 45.526], [45.3, 44.689], [41.5, 42.694], [42.5, 44.472], [44.6, 46.633], [41.9, 42.274], [42.9, 44.986], [45.7, 47.486], [44.9, 48.054], [44.4, 47.017], [44.3, 47.13], [45.9, 47.036], [46.9, 47.569], [47.8, 49.81], [48.4, 49.725], [48.3, 49.361], [48.9, 50.33], [49.7, 51.396], [46.6, 47.781], [47.2, 49.126], [47.6, 50.175], [46.8, 48.507], [48.5, 52.47], [49.1, 51.596], [43.6, 45.149], [48.4, 49.251], [47.4, 50.224], [50.2, 52.536], [48.0, 49.171], [46.3, 48.546], [51.1, 52.607], [51.7, 53.816], [49.4, 51.921], [50.9, 53.546], [52.2, 53.948], [52.3, 53.832], [51.2, 53.264]]


## Exercise 4:
### Implement Gated Relu activations as well as Gated Linear activations and compare them with Relu (reference: https://arxiv.org/pdf/1612.08083.pdf )
### Hint: Gated Relu activations are sigmoid(conv1_1(x)) * relu(conv1_2(x))
### Hint: Gated Linear activations are sigmoid(conv1_1(x)) * conv1_2(x)

### Feel free to play with other variants of gating


## Exercise 5:

### Add max-pooling (over time) after passing through conv as well as add non-linear fully connected layer

## Exercise 6:

### Use Bag-of-Words and Bag-of-NGrams model for this task and compare it with RNN and CNN

## Exercise 7:

### Use FastText for this task