In [1]:
from torchtext import data
import torch
import torch.nn as nn
import torch.optim as optim
import random
import datetime
from math import floor
import ipdb;

seed = 1
torch.manual_seed(seed)
device = torch.device('cuda')
iterator_device = 0
##uncomment for cpu
#device = torch.device('cpu')
#iterator_device = -1

In [2]:
#add batch_first for CNN
blurb_field = data.Field(sequential=True, use_vocab=True, lower=True, tokenize="spacy", include_lengths=True, batch_first=True)
state_field = data.LabelField(sequential=False, use_vocab=False, tensor_type=torch.FloatTensor,
                              preprocessing=lambda x:1 if x=='successful' else 0)
dataset = data.TabularDataset(path='df_text_eng.csv',format='csv',skip_header=True,fields=[('Unnamed: 0', None),('blurb', blurb_field),('state', state_field)])

In [3]:
train, test, validation = dataset.split(random_state=random.seed(seed), split_ratio=[70,15,15])
print("Training Set Size: ", len(train))
print("Test Set Size: ", len(test))
print("Validation Set Size: ", len(validation))

Training Set Size:  150859
Test Set Size:  32327
Validation Set Size:  32327


In [4]:
#words that appear less than 3 times (2 or less) will be considered unknown words with tag "<unk>", 
#they will have the same word embedding
#vocabulary of the training set will only be used to emulate real world situtaions when the test set is unknown
blurb_field.build_vocab(train,min_freq=3)
print("Vocabulary size used: ",len(blurb_field.vocab))

Vocabulary size used:  28760


In [5]:
batch_size = 64

train_iter = data.BucketIterator(dataset=train, batch_size=batch_size, sort_key=lambda x:len(x.blurb),device=iterator_device,
                                repeat=False, train=True, sort_within_batch=True)
test_iter = data.BucketIterator(dataset=test, batch_size=batch_size, sort_key=lambda x:len(x.blurb),device=iterator_device,
                                train=False, sort_within_batch=True)
validation_iter = data.BucketIterator(dataset=validation, batch_size=batch_size, sort_key=lambda x:len(x.blurb),device=iterator_device,
                                      train=False, sort_within_batch=True)

In [6]:
class CNNNet(nn.Module):
    def __init__(self, vocab_size, embed_size, filters_num, output_size, padding_idx=None, init_embedding=None):
        super(CNNNet, self).__init__()
        if (init_embedding is not None):
            self.embedding = nn.Embedding(vocab_size, embed_size, padding_idx=padding_idx, _weight=init_embedding)    # word embedding
        else:
            self.embedding = nn.Embedding(vocab_size, embed_size, padding_idx=padding_idx)    # word embedding
        
        self.conv3 = nn.Conv2d(in_channels=1, out_channels=filters_num, kernel_size=(3,embed_size), stride=1, padding=(1,0))
        self.conv4 = nn.Conv2d(in_channels=1, out_channels=filters_num, kernel_size=(4,embed_size), stride=1, padding=(0,0))
        self.conv5 = nn.Conv2d(in_channels=1, out_channels=filters_num, kernel_size=(5,embed_size), stride=1, padding=(2,0))
        self.relu = nn.ReLU();
        
        #put the pool in the forward function as it depends on the input sentence length, so as to choose one feature per activation map/sequence
        #self.pool = nn.MaxPool1d(kernel_size=2, stride=1, padding=0)
        
        #input of filters_nup*3 as we use one feature per filter (after the pooling layer)
        self.out = nn.Linear(filters_num*3, output_size)   # output layer (Fully Connected)
        

    def forward(self, x, x_lengths):
        #x -> [ minibatch size, sentence length(max, smaller sentences are padded)]
        embeds = self.embedding(x)
        
        #embeds -> [minibatch size, sentence length, embedding size]
        embeds = embeds.unsqueeze(1) #make it [minibatch size, 1(number of channels), sentence length, embedding size]
        
        #embeds -> [minibatch size, 1, sentence length, embedding size]
        convOut3 = self.conv3(embeds)
        conv4pad = nn.ZeroPad2d((0,0,1,2)) ##as the conv4 needs an asymmetric padding due to even kernel size
        convOut4 = self.conv4(conv4pad(embeds))
        convOut5 = self.conv5(embeds)
        
        #convOutx -> [minibatch size, filters number for each filter size, sentence length, 1]
        convOut = torch.cat((convOut3, convOut4, convOut5), 1)
        
        #convOut -> [minibatch size, total filters number, sentence length, 1]
        convOut = self.relu(convOut)
        
        #convOut -> [minibatch size, total filters number, sentence length, 1]
        convOut = convOut.squeeze(3)
                
        #convOut -> [minibatch size, total filters number, sentence length]
        #kernel size of sentence length to get one feature per activation map/sequence
        poolOut = nn.functional.max_pool1d(convOut,kernel_size = int(max(x_lengths))) 
        
        #poolOut -> [minibatch size, total filters number, 1]
        #squeeze makes poolOut [minibatch size, total filters number] before entering the fully connected layer
        score = self.out(poolOut.squeeze(2))     
        return score

In [7]:
def calc_accuracy(predictions, y):
    predictions = torch.round(torch.sigmoid(predictions))
    correct = (sum(predictions == y)).float()
    accuracy = correct/len(predictions)
    return accuracy

In [8]:
padding_idx = blurb_field.vocab.stoi['<pad>']
embed_size = 100
filters_num = 100 #filters number per kernel size

In [9]:
cnnmodel = CNNNet(len(blurb_field.vocab),embed_size,filters_num,1, padding_idx=padding_idx)
optimizer = optim.SGD(cnnmodel.parameters(), lr=1e-3)
loss_fn = nn.BCEWithLogitsLoss()
cnnmodel = cnnmodel.to(device)
loss_fn = loss_fn.to(device)

In [10]:
def train(model, iterator, optimizer, loss_fn):
    epoch_loss = 0
    epoch_accuracy = 0
    
    model.train()
    
    for minibatch in iterator:
        optimizer.zero_grad()
        predictions = model(minibatch.blurb[0],minibatch.blurb[1]).squeeze(1)
        loss = loss_fn(predictions, minibatch.state)
        accuracy = calc_accuracy(predictions, minibatch.state)    
        loss.backward()
        optimizer.step()
        
        epoch_loss += loss.item()
        epoch_accuracy += accuracy.item()
        
    return epoch_loss / len(iterator), epoch_accuracy / len(iterator)

In [11]:
def evaluate(model, iterator, loss_fn):
    epoch_loss = 0
    epoch_accuracy = 0
    
    model.eval()
    
    with torch.no_grad():
        for minibatch in iterator:
            predictions = model(minibatch.blurb[0],minibatch.blurb[1]).squeeze(1)
            loss = loss_fn(predictions, minibatch.state)
            accuracy = calc_accuracy(predictions, minibatch.state)    

            epoch_loss += loss.item()
            epoch_accuracy += accuracy.item()
        
    return epoch_loss / len(iterator), epoch_accuracy / len(iterator)

In [12]:
no_epochs = 10
#import ipdb; ipdb.set_trace() # debugging starts here
train_accuracies = []
validation_accuracies = []
init_time = datetime.datetime.now()
for epoch in range(no_epochs):
    train_loss, train_accuracy = train(cnnmodel, train_iter, optimizer, loss_fn)
    validation_loss, validation_accuracy = evaluate(cnnmodel, validation_iter, loss_fn)
    train_accuracies.append(train_accuracy)
    validation_accuracies.append(validation_accuracy)
    current_time = datetime.datetime.now()
    total_time = (current_time-init_time).total_seconds()
    print(f'Epoch: {epoch+1:02}, Train Loss: {train_loss:.4f}, Train Accuracy: {train_accuracy*100:.2f}%, ' +  
          f'Validation Loss: {validation_loss:.4f}, Validation Accuracy: {validation_accuracy*100:.2f}%')
    print(f'Total Time Passed: {floor(total_time/3600)} hours, {floor(total_time/60)%60} minutes, {total_time%60:.2f} seconds')

  return Variable(arr, volatile=not train), lengths
  return Variable(arr, volatile=not train)


Epoch: 01, Train Loss: 0.6925, Train Accuracy: 52.30%, Validation Loss: 0.6876, Validation Accuracy: 54.28%
Total Time Passed: 0 hours, 4 minutes, 12.94 seconds
Epoch: 02, Train Loss: 0.6853, Train Accuracy: 55.14%, Validation Loss: 0.6821, Validation Accuracy: 56.06%
Total Time Passed: 0 hours, 8 minutes, 26.03 seconds
Epoch: 03, Train Loss: 0.6798, Train Accuracy: 56.74%, Validation Loss: 0.6782, Validation Accuracy: 56.98%
Total Time Passed: 0 hours, 12 minutes, 38.90 seconds
Epoch: 04, Train Loss: 0.6755, Train Accuracy: 57.67%, Validation Loss: 0.6746, Validation Accuracy: 57.91%
Total Time Passed: 0 hours, 16 minutes, 51.67 seconds
Epoch: 05, Train Loss: 0.6717, Train Accuracy: 58.42%, Validation Loss: 0.6717, Validation Accuracy: 58.33%
Total Time Passed: 0 hours, 21 minutes, 4.16 seconds
Epoch: 06, Train Loss: 0.6682, Train Accuracy: 58.96%, Validation Loss: 0.6692, Validation Accuracy: 58.67%
Total Time Passed: 0 hours, 25 minutes, 16.85 seconds
Epoch: 07, Train Loss: 0.6650, 

In [14]:
test_loss, test_accuracy = evaluate(cnnmodel, test_iter, loss_fn)

print(f'Test Loss: {test_loss:.4f}, Test Accuracy: {test_accuracy*100:.2f}%')

  return Variable(arr, volatile=not train), lengths
  return Variable(arr, volatile=not train)


Test Loss: 0.6603, Test Accuracy: 60.33%


In [15]:
import matplotlib.pyplot as plt

plt.plot(range(1,no_epochs+1),train_accuracies)
plt.show()

<Figure size 640x480 with 1 Axes>

In [16]:
with open('CNNModelTraining.txt', 'w') as f:
    for item in train_accuracies:
        f.write("%s\n" % item)

In [17]:
with open('CNNModelVal.txt', 'w') as f:
    for item in validation_accuracies:
        f.write("%s\n" % item)