In [38]:
import torch
import torch.nn as nn

# Embedding module

https://pytorch.org/docs/stable/generated/torch.nn.Embedding.html

In [None]:
# Signature sketch only: num_embeddings and embedding_dim are placeholder names here,
# so this cell illustrates the constructor arguments rather than being meant to run.
torch.nn.Embedding(num_embeddings, # number of rows in the lookup table, i.e. the vocabulary size
                   embedding_dim, # length of each embedding vector
                   padding_idx=None, # index of the padding token; its embedding is an all-zero vector
                   )

In [3]:
# simple lookup module but can be trained

In [39]:
# Index 0 is reserved for padding and counts as a vocabulary entry, so real words
# start at index 1 and num_embeddings must be (number of words + 1).
emb = nn.Embedding(num_embeddings=10, embedding_dim=20, padding_idx=0)

In [40]:
# The embedding layer takes integer token ids, one per word;
# here four word ids are followed by two padding ids (0).
inp_seq = torch.tensor([1, 4, 3, 2, 0, 0], dtype=torch.long)

In [41]:
emb(inp_seq)

tensor([[-0.2826, -0.1858, -0.1663, -0.5926,  0.7455, -0.8056,  0.9270,  1.1362,
          0.4072, -1.0552,  0.0958, -0.7895,  0.3323, -0.1014,  0.0308,  1.2783,
         -0.3603,  0.2703,  0.0093,  1.9665],
        [ 1.1026,  1.6717, -0.0953,  0.5196, -0.6114,  0.2073,  0.2875, -1.5956,
          1.5748,  0.0622,  0.6153,  0.0609, -2.6302,  0.3165,  0.0894,  0.0774,
          1.6418,  0.8907,  0.7228,  0.2788],
        [-0.1668,  0.6591,  2.6490, -0.3813, -1.1069,  1.2718, -0.1496,  0.0862,
          0.3578,  0.7714, -1.7435,  1.1185,  0.5483, -1.2257,  1.6306,  1.5401,
          1.0606,  0.1643,  0.9296,  0.3183],
        [-0.7977,  0.4928,  0.7599, -0.8532,  1.0127, -0.6844,  1.5606, -2.5766,
         -0.5283, -0.0092,  1.5554, -0.4958, -1.1682, -0.0657,  0.5264, -0.6802,
          0.6588,  0.5126, -0.6077,  1.1235],
        [ 0.0000,  0.0000,  0.0000,  0.0000,  0.0000,  0.0000,  0.0000,  0.0000,
          0.0000,  0.0000,  0.0000,  0.0000,  0.0000,  0.0000,  0.0000,  0.0000,
      

In [42]:
emb.weight

Parameter containing:
tensor([[ 0.0000,  0.0000,  0.0000,  0.0000,  0.0000,  0.0000,  0.0000,  0.0000,
          0.0000,  0.0000,  0.0000,  0.0000,  0.0000,  0.0000,  0.0000,  0.0000,
          0.0000,  0.0000,  0.0000,  0.0000],
        [-0.2826, -0.1858, -0.1663, -0.5926,  0.7455, -0.8056,  0.9270,  1.1362,
          0.4072, -1.0552,  0.0958, -0.7895,  0.3323, -0.1014,  0.0308,  1.2783,
         -0.3603,  0.2703,  0.0093,  1.9665],
        [-0.7977,  0.4928,  0.7599, -0.8532,  1.0127, -0.6844,  1.5606, -2.5766,
         -0.5283, -0.0092,  1.5554, -0.4958, -1.1682, -0.0657,  0.5264, -0.6802,
          0.6588,  0.5126, -0.6077,  1.1235],
        [-0.1668,  0.6591,  2.6490, -0.3813, -1.1069,  1.2718, -0.1496,  0.0862,
          0.3578,  0.7714, -1.7435,  1.1185,  0.5483, -1.2257,  1.6306,  1.5401,
          1.0606,  0.1643,  0.9296,  0.3183],
        [ 1.1026,  1.6717, -0.0953,  0.5196, -0.6114,  0.2073,  0.2875, -1.5956,
          1.5748,  0.0622,  0.6153,  0.0609, -2.6302,  0.3165,  0

In [43]:
emb.weight.shape

torch.Size([10, 20])

In [44]:
# to freeze the embedding layer set emb.weight.requires_grad to False
emb.weight.requires_grad

True

In [45]:
# Float tensor holding the pretrained weights: 2 vocabulary entries x 3 embedding dims.
weight = torch.tensor([[1, 2.3, 3], [4, 5.1, 6.3]], dtype=torch.float32)
embedding = nn.Embedding.from_pretrained(weight)
# Get the embedding for index 1. The ids are stored in `token_ids` rather than `input`
# so the builtin input() is not shadowed for the rest of the kernel session.
token_ids = torch.tensor([1], dtype=torch.long)
embedding(token_ids)

tensor([[4.0000, 5.1000, 6.3000]])

In [46]:
embedding.weight.requires_grad

False

# CNNs

https://pytorch.org/docs/stable/generated/torch.nn.Conv2d.html

https://pytorch.org/docs/stable/generated/torch.nn.MaxPool2d.html

Conv2d(in_channels, out_channels, kernel_size, stride=1, padding=0)

MaxPool2d(kernel_size, stride=None, padding=0) # default stride = kernel size

In [47]:
# A single 1-channel 224 x 224 input without a batch dimension.
inp = torch.randn(1, 224, 224)

In [48]:
class Convnet(nn.Module):
    """Two convolution/pooling stages followed by two linear layers.

    The layer sizes are fixed for 1 x 224 x 224 inputs. Shape walk-through
    (5 x 5 kernels, stride 1, no padding; 2 x 2 max-pooling with stride 2):

        input            1 x 224 x 224
        conv_1    ->     8 x 220 x 220    (224 - 5 + 1 = 220)
        maxpool   ->     8 x 110 x 110
        conv_2    ->    16 x 106 x 106    (110 - 5 + 1 = 106)
        maxpool   ->    16 x  53 x  53
        fc_3      ->   128
        fc_4      ->     1
    """

    def __init__(self):
        super().__init__()

        self.conv_1 = nn.Conv2d(in_channels=1, out_channels=8, kernel_size=5)
        self.conv_2 = nn.Conv2d(in_channels=8, out_channels=16, kernel_size=5)
        self.maxpool = nn.MaxPool2d(kernel_size=2, stride=2)  # shared by both stages
        self.fc_3 = nn.Linear(in_features=16 * 53 * 53, out_features=128)
        self.fc_4 = nn.Linear(in_features=128, out_features=1)
        self.relu = nn.ReLU()  # activation reused after each conv and after fc_3

    def forward(self, inp):
        """Run the network on `inp` and return one output value per row.

        The size of the first dimension of `inp` is used as the number of rows
        when flattening, i.e. the batch size for (N, 1, 224, 224) inputs.
        """
        stage_1 = self.maxpool(self.relu(self.conv_1(inp)))
        stage_2 = self.maxpool(self.relu(self.conv_2(stage_1)))
        # Flatten everything except the leading dimension before the linear layers.
        flat = stage_2.reshape(inp.shape[0], -1)
        hidden = self.relu(self.fc_3(flat))
        return self.fc_4(hidden)

In [49]:
model = Convnet()

In [50]:
model(inp)

tensor([[-0.0295]], grad_fn=<AddmmBackward0>)

In [51]:
# Batched input: the dimensions are (batch size, channels, height, width).
inp = torch.randn(10, 1, 224, 224)

In [52]:
model(inp)

tensor([[-0.0439],
        [-0.0116],
        [-0.0440],
        [-0.0400],
        [ 0.0300],
        [-0.0292],
        [-0.0447],
        [-0.0310],
        [-0.0072],
        [-0.0750]], grad_fn=<AddmmBackward0>)

<img src="cnns_text.png">

In [53]:
# Text batch laid out like an image: (batch=5, channels=1, words=10, embedding size=20).
inp = torch.randn(5, 1, 10, 20)

In [54]:
# Each of the 30 filters spans 5 consecutive words and the full embedding width (20),
# so the filter only slides along the word axis.
conv2 = nn.Conv2d(in_channels=1, out_channels=30, kernel_size=(5, 20))

In [55]:
out = conv2(inp)

In [56]:
conv2(inp).shape

torch.Size([5, 30, 6, 1])

In [57]:
# Pool over pairs of neighbouring positions along the word axis only.
maxpool = nn.MaxPool2d(kernel_size=(2, 1))

In [58]:
out = maxpool(out)

In [59]:
out.shape

torch.Size([5, 30, 3, 1])

# Tasks

We will train a CNN based classifier for the sentiment classification task 

In [31]:
import pickle

In [32]:
# Load the training sentences; train_data[0] is a list of word/punctuation tokens.
# NOTE: pickle.load can execute arbitrary code while loading, so only unpickle
# files that come from a trusted source.
with open('Sentiment/sentiment_train_X.p', 'rb') as fs:
    train_data = pickle.load(fs)

In [33]:
train_data[0]

['The',
 'Rock',
 'is',
 'destined',
 'to',
 'be',
 'the',
 '21st',
 'Century',
 "'s",
 'new',
 '``',
 'Conan',
 '``',
 'and',
 'that',
 'he',
 "'s",
 'going',
 'to',
 'make',
 'a',
 'splash',
 'even',
 'greater',
 'than',
 'Arnold',
 'Schwarzenegger',
 ',',
 'Jean-Claud',
 'Van',
 'Damme',
 'or',
 'Steven',
 'Segal',
 '.']

In [34]:
# Load the training labels, one per sentence in train_data (train_label[0] is 1).
# NOTE: pickle.load can execute arbitrary code while loading, so only unpickle
# files that come from a trusted source.
with open('Sentiment/sentiment_train_y.p', 'rb') as fs:
    train_label = pickle.load(fs)

In [35]:
train_label[0]

1

1) Treating all the unique words present in the training data as your vocabulary, create a word2index mapping.

2) Write a function which takes a list of tokens as input and returns a list of ids

In [None]:
# Example (the actual ids depend on the word2index mapping built in task 1):
#   input  = ['The', 'movie', 'is', 'good']
#   output = [10, 15, 2, 4]
# Note that the vocabulary is only made of words in the train set, which means that in the
# val and test sets you may encounter new words. Such words are ignored (skipped).
def text2ids(tokens=(), word2index=None):
    """Convert a sequence of tokens into a list of vocabulary ids.

    Args:
        tokens: iterable of word strings, e.g. one tokenized sentence.
        word2index: dict mapping word -> integer id. When omitted, the
            notebook-level variable named ``word2index`` is used.

    Returns:
        A list with the id of every token found in the mapping, in the original
        order. Tokens missing from the mapping are dropped, so the result can be
        shorter than ``tokens`` (or empty).

    Raises:
        NameError: if no mapping is passed and no notebook-level ``word2index``
            has been defined yet.
    """
    if word2index is None:
        # Fall back to the mapping defined at notebook level so that callers
        # passing only the tokens keep working.
        word2index = globals().get("word2index")
        if word2index is None:
            raise NameError(
                "word2index is not defined; build the vocabulary mapping from "
                "the training data first or pass it explicitly"
            )
    return [word2index[token] for token in tokens if token in word2index]

3) Write a pytorch dataset class for the sentiment dataset

In [36]:
from torch.utils.data import DataLoader, Dataset

In [None]:
# We were selecting batches previously by creating a generator (Week 2 exercise)
# PyTorch provides the Dataset and DataLoader classes, which allow you to do this in an easier way
# The DataLoader takes care of sampling batches from the Dataset
# and lets you reshuffle the data at every epoch (shuffle=True)

In [37]:
len(train_data), len(train_label)

(6920, 6920)

In [None]:
class SentimentData(Dataset):
    """Map-style dataset pairing tokenized sentences with their sentiment labels.

    Args:
        data: indexable collection of sentences, each a list of tokens.
        label: indexable collection of integer labels, aligned with ``data``.
    """

    def __init__(self, data, label):
        super().__init__()
        self.data = data
        self.label = label

    def __len__(self):
        """Return the number of examples (the number of labels)."""
        return len(self.label)

    def __getitem__(self, index):
        """Return one training instance ``(input_ids, label)`` as tensors.

        ``input_ids`` is a 1-D int64 tensor produced by ``text2ids`` from the
        sentence at ``index``; ``label`` is a 0-D int64 tensor holding the label.
        """
        inp_text = self.data[index]
        label = self.label[index]
        input_ids = text2ids(inp_text)  # tokens -> vocabulary ids
        # torch.tensor stores the given values. torch.LongTensor(n) with an int n
        # would instead allocate an uninitialised tensor of n elements, so the
        # label value itself would be lost.
        return (
            torch.tensor(input_ids, dtype=torch.long),
            torch.tensor(label, dtype=torch.long),
        )

2) Design a CNN based text classifier model. It should include an embedding module which should be initialized randomly and trained.

In [None]:
class Classifier(nn.Module):
    # TODO: exercise placeholder. No layers and no forward() are defined yet, so
    # the class cannot be trained as-is. The training cell instantiates it as
    # Classifier() with no arguments and feeds its output to BCEWithLogitsLoss.
    pass

3) Train the model for 10 epochs. At the end of each epoch, compute validation accuracy and save the model with the best validation accuracy. 

In [None]:
from sklearn.metrics import accuracy_score

In [None]:
def pad_collate(batch):
    """Collate ``(input_ids, label)`` pairs into one padded batch.

    Sentences have different lengths, so the id sequences are right-padded with
    0 (the padding index) up to the longest sequence in the batch.
    Returns ``(padded_ids, labels)`` with the batch as the first dimension.
    """
    sequences, labels = zip(*batch)
    padded = nn.utils.rnn.pad_sequence(sequences, batch_first=True, padding_value=0)
    return padded, torch.stack(labels)


max_accuracy = 0
model = Classifier()
optimizer = torch.optim.SGD(model.parameters(), lr=0.001)
criterion = nn.BCEWithLogitsLoss() # Compute only the logits i.e., no sigmoid in the last layer
train_d = SentimentData(train_data, train_label)
# NOTE(review): val_data / val_label are not loaded in any visible cell; they
# must be defined before this cell is run.
val_d = SentimentData(val_data, val_label)
train_loader = DataLoader(train_d, batch_size=16, shuffle=True, collate_fn=pad_collate)
# Shuffling has no effect on accuracy, so the validation order is kept fixed.
val_loader = DataLoader(val_d, batch_size=16, shuffle=False, collate_fn=pad_collate)
epochs = 10
for e in range(epochs):
    for X, y in train_loader:
        optimizer.zero_grad()
        # Assumes the model emits one logit per example, shaped (batch,) or (batch, 1).
        out = model(X).reshape(-1)
        # BCEWithLogitsLoss needs float targets with the same shape as the logits.
        loss = criterion(out, y.float().reshape(-1))
        loss.backward()
        optimizer.step()

    # At the end of each epoch, we compute the validation accuracy
    model.eval() # set the model in eval mode
    ground_truth, predicted = [], []
    with torch.no_grad(): # no gradients are needed for evaluation
        for X, y in val_loader:
            # output will be logits; map them to a value between 0 and 1
            out = torch.sigmoid(model(X).reshape(-1))
            # map to a class: >=0.5 is 1 and <0.5 is 0
            predicted.extend((out >= 0.5).long().tolist())
            # collect the ground-truth classes over all the batches
            ground_truth.extend(y.reshape(-1).long().tolist())

    accuracy = accuracy_score(ground_truth, predicted)
    if accuracy > max_accuracy:
        # remember the new best score and save the corresponding weights
        max_accuracy = accuracy
        torch.save(model.state_dict(), 'best_model.pt')
    model.train() # revert back to training mode for the next epoch

4) Evaluate the model on the test set and report the test accuracy.

In [None]:
def pad_collate(batch):
    """Collate ``(input_ids, label)`` pairs into one padded batch.

    Defined in this cell so that evaluation does not depend on the training
    cell having been run in the current session. Id sequences are right-padded
    with 0 (the padding index) up to the longest sequence in the batch.
    """
    sequences, labels = zip(*batch)
    padded = nn.utils.rnn.pad_sequence(sequences, batch_first=True, padding_value=0)
    return padded, torch.stack(labels)


best_model = Classifier()
best_model.load_state_dict(torch.load('best_model.pt')) # load the best trained model i.e., one with the highest val acc
# NOTE(review): test_data / test_label are not loaded in any visible cell; they
# must be defined before this cell is run.
test_d = SentimentData(test_data, test_label)
# Shuffling has no effect on accuracy, so the test order is kept fixed.
test_loader = DataLoader(test_d, batch_size=16, shuffle=False, collate_fn=pad_collate)

best_model.eval()
test_truth, test_predicted = [], []
with torch.no_grad(): # gradients won't be calculated, saves memory
    for X, y in test_loader:
        # Assumes the model emits one logit per example, shaped (batch,) or (batch, 1).
        probs = torch.sigmoid(best_model(X).reshape(-1))
        test_predicted.extend((probs >= 0.5).long().tolist()) # >=0.5 is class 1
        test_truth.extend(y.reshape(-1).long().tolist())

test_accuracy = accuracy_score(test_truth, test_predicted)
test_accuracy
        