In [1]:
import torch
import torch.nn as nn

In [2]:
from tqdm.notebook import tqdm

# Tasks

We will train a CNN based classifier for the sentiment classification task 

In [3]:
import pickle

In [4]:
with open('Sentiment/sentiment_train_X.p', 'rb') as fs:
    train_data = pickle.load(fs)

In [5]:
with open('Sentiment/sentiment_train_y.p', 'rb') as fs:
    train_label = pickle.load(fs)

In [6]:
with open('Sentiment/sentiment_val_X.p', 'rb') as fs:
    val_data = pickle.load(fs)

In [7]:
with open('Sentiment/sentiment_val_y.p', 'rb') as fs:
    val_label = pickle.load(fs)

In [8]:
with open('Sentiment/sentiment_test_X.p', 'rb') as fs:
    test_data = pickle.load(fs)

In [9]:
with open('Sentiment/sentiment_test_y.p', 'rb') as fs:
    test_label = pickle.load(fs)

1) Considering all the unique words present in the training data as your vocabulary create a word2index mapping.

In [10]:
word2index = {}
ind = 1
for i, sent in tqdm(enumerate(train_data)):
    for word in sent:
        if word in word2index:
            continue
        else:
            word2index[word] = ind
            ind+=1

0it [00:00, ?it/s]

In [11]:
len(word2index)

16273

In [12]:
sum([len(sent) for sent in train_data])/len(train_data) # average length of sentences

19.309104046242776

In [13]:
max([len(sent) for sent in train_data])

53

2) Write a function which takes as input an input takes and returns a list of ids

In [33]:
# input = ['The', 'movie', 'is', 'good']
# output = [10, 15, 2, 4]
# Note that the vocabulary is only made of words in the train set which mean in the val and test set 
# you may encounter new words. You can ignore such words and make sure your function is capable of handling it.
def text2ids(sent, max_len=30):
    enc = [word2index[word] for word in sent if word in word2index]
    if len(enc)<max_len:
        return enc + [0 for _ in range(max_len - len(enc))]
    else:
        return enc[:max_len]

In [16]:
text2ids(["this", "movie", "is", "good"])

[109,
 232,
 3,
 134,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0]

3) Write a pytorch dataset class for the sentiment dataset

In [18]:
from torch.utils.data import DataLoader, Dataset

In [36]:
class SentimentData(Dataset):
    def __init__(self, data, labels):
        super().__init__()
        self.data = data
        self.labels = labels
    
    def __len__(self):
        return len(self.labels)
    
    def __getitem__(self, index):
        inp_text = self.data[index]
        label = self.labels[index]
        input_ids = text2ids(inp_text) # execute preprocess code here
        return torch.LongTensor(input_ids), torch.FloatTensor([label])

2) Design a CNN based text classifier mode. It should include a embedding module which should be initialized randomly and trained.

In [20]:
class Convnet(nn.Module):
    def __init__(self):
        super(Convnet,self).__init__()
        self.embed = nn.Embedding(num_embeddings=len(word2index)+1,embedding_dim=200,padding_idx=0)
        self.conv_1 = nn.Conv1d(200,200,5) # 200 filters of size 5
        self.conv_2 = nn.Conv1d(200,200,4) # 200 filters of size 4
        self.conv_3 = nn.Conv1d(200,200,3) # 200 filters of size 4
        self.fc_3 = nn.Linear(600,1000)
        self.fc_4 = nn.Linear(1000,1)
        self.relu = nn.ReLU() # Activation function
        
    def forward(self,inp):
        inp = self.embed(inp)
        inp = inp.transpose(2,1)
        out_1 = self.conv_1(inp)
        out_2 = self.conv_2(inp)
        out_3 = self.conv_3(inp)
        out_1 = self.relu(out_1)
        out_2 = self.relu(out_2)
        out_3 = self.relu(out_3)
        out_1 = torch.max(out_1, dim=2)
        out_2 = torch.max(out_2, dim=2)
        out_3 = torch.max(out_3, dim=2)
        #print(out_1[0].shape)
        out = torch.cat((out_1[0], out_2[0], out_3[0]), dim=1)
        out = self.fc_3(out)
        out = self.relu(out)
        out = self.fc_4(out)
        return out

3) Train the model for 10 epochs. At the end of each epoch, compute validation accuracy and save the model with the best validation accuracy. 

In [21]:
from sklearn.metrics import accuracy_score

In [37]:
max_accuracy = 0
train_dataset = SentimentData(train_data, train_label)
val_dataset = SentimentData(val_data, val_label)
clf = Convnet()
train_loader = DataLoader(train_dataset, batch_size=16)
val_loader = DataLoader(val_dataset, batch_size=32)
criterion = nn.BCEWithLogitsLoss()
optimizer = torch.optim.Adam(clf.parameters(), lr=0.001)
epochs = 10
for e in tqdm(range(epochs)):
    for X, y in train_loader:
        out = clf(X)
        loss = criterion(out.reshape(-1), y.reshape(-1))
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        
    
    clf.eval()
    pred_class = []
    true_class = []
    with torch.no_grad():
        for X, y in val_loader:
            out = clf(X)
            #pred_class.extend(torch.a(out, dim=1).reshape(-1).numpy().tolist())
            pred_class.extend(torch.round(torch.sigmoid(out)).reshape(-1).numpy().tolist())
            true_class.extend(y.reshape(-1).numpy().tolist())
    
    score = accuracy_score(true_class, pred_class)
    print(f'validation accuracy at the end of epoch {e}: {score}')
    if score > max_accuracy:
        max_accuracy = score
        torch.save(clf.state_dict(), 'best_model.pt')
    clf.train()    

  0%|          | 0/10 [00:00<?, ?it/s]

validation accuracy at the end of epoch 0: 0.4908256880733945
validation accuracy at the end of epoch 1: 0.588302752293578
validation accuracy at the end of epoch 2: 0.6399082568807339
validation accuracy at the end of epoch 3: 0.6674311926605505
validation accuracy at the end of epoch 4: 0.6410550458715596
validation accuracy at the end of epoch 5: 0.6892201834862385
validation accuracy at the end of epoch 6: 0.6743119266055045
validation accuracy at the end of epoch 7: 0.6548165137614679
validation accuracy at the end of epoch 8: 0.6227064220183486
validation accuracy at the end of epoch 9: 0.5928899082568807


4) Evaluate the model on the test set and report the test accuracy.

In [38]:
clf_test = Convnet()
clf_test.load_state_dict(torch.load('best_model.pt'))

<All keys matched successfully>

In [39]:
clf_test.eval()
test_dataset = SentimentData(test_data, test_label)
test_loader = DataLoader(test_dataset, batch_size=32)
pred_class = []
true_class = []
with torch.no_grad():
    for X, y in test_loader:
        out = clf(X)
        pred_class.extend(torch.round(torch.sigmoid(out)).reshape(-1).numpy().tolist())
        true_class.extend(y.reshape(-1).numpy().tolist())
        
score = accuracy_score(true_class, pred_class)
print(f'Best test accuracy: {score}')

Best test accuracy: 0.5930807248764415
