In [0]:
import torch
from torchtext import data
from torchtext import datasets
import random

SEED = 1234

# Seed PyTorch (CPU and CUDA) and force deterministic cuDNN kernels so that
# repeated runs of the notebook are comparable.
torch.manual_seed(SEED)
torch.cuda.manual_seed(SEED)
torch.backends.cudnn.deterministic = True

# TEXT describes how a raw review is processed (tokenised with spaCy);
# LABEL describes how the sentiment label is processed (stored as a float).
TEXT = data.Field(tokenize='spacy')
LABEL = data.LabelField(dtype=torch.float)

# Load the IMDB dataset with its predefined train/test split.
train_data, test_data = datasets.IMDB.splits(TEXT, LABEL)

# Carve a validation set out of the training set (torchtext's default split
# ratio is used). `split` expects a state tuple as returned by
# `random.getstate()`; `random.seed()` itself returns None, so seed first and
# then pass the resulting state explicitly to get the same split on every run.
random.seed(SEED)
train_data, valid_data = train_data.split(random_state=random.getstate())

# Vocabulary, model definition, and training

In [0]:
# Build the text vocabulary from the training split only, keeping at most the
# 25,000 most frequent tokens (the original note puts the full vocabulary at
# roughly 100,000 words). Passing `vectors` attaches the pre-trained
# "glove.6B.100d" embeddings, downloading them if necessary.
TEXT.build_vocab(train_data, max_size=25000, vectors="glove.6B.100d")
# Build the label vocabulary from the same training split.
LABEL.build_vocab(train_data)

In [0]:
# Number of examples per mini-batch, shared by all three iterators.
BATCH_SIZE = 64

# Put tensors on the GPU when one is available, otherwise stay on the CPU.
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')

# One iterator per split (train / validation / test), returned in that order.
# BucketIterator batches sentences of similar length together, which cuts
# down on the amount of padding needed in each batch.
train_iterator, valid_iterator, test_iterator = data.BucketIterator.splits(
    (train_data, valid_data, test_data),
    batch_size=BATCH_SIZE,
    device=device,
)

In [0]:
import torch.nn as nn
import torch.nn.functional as F

class CNN(nn.Module):
    """Convolutional sentence classifier.

    Token indices are embedded, passed through one 2-D convolution per entry
    in ``filter_sizes`` (each kernel spans the full embedding width), ReLU'd,
    max-pooled over the sentence dimension, concatenated, regularised with
    dropout and projected to ``output_dim`` logits by a linear layer.

    Args:
        vocab_size: number of rows in the embedding table.
        embedding_dim: size of each token embedding.
        n_filters: number of output channels of every convolution.
        filter_sizes: iterable of kernel heights (tokens covered per filter).
        output_dim: number of output logits.
        dropout: dropout probability applied to the pooled features.
    """

    def __init__(self, vocab_size, embedding_dim, n_filters, filter_sizes, output_dim, dropout):
        super().__init__()

        self.embedding = nn.Embedding(vocab_size, embedding_dim)

        # One convolution per filter height; the kernel width equals the
        # embedding size, so each filter slides along the sentence only.
        branches = []
        for height in filter_sizes:
            branches.append(
                nn.Conv2d(
                    in_channels=1,
                    out_channels=n_filters,
                    kernel_size=(height, embedding_dim),
                )
            )
        self.convs = nn.ModuleList(branches)

        # After pooling, every branch contributes n_filters features.
        self.fc = nn.Linear(len(filter_sizes) * n_filters, output_dim)
        self.dropout = nn.Dropout(dropout)

    def forward(self, x):
        """Return logits of shape [batch size, output_dim].

        ``x`` holds token indices laid out as [sent len, batch size]. The
        sentence must be at least as long as the largest filter height.
        """
        # [sent len, batch size] -> [batch size, sent len]
        tokens = x.permute(1, 0)

        # [batch size, sent len, emb dim] -> [batch size, 1, sent len, emb dim]
        # (the extra axis is the single input channel Conv2d expects)
        embedded = self.embedding(tokens).unsqueeze(1)

        pooled = []
        for conv in self.convs:
            # [batch size, n_filters, sent len - filter height + 1]
            feature_map = F.relu(conv(embedded)).squeeze(3)
            # Max over the whole sentence axis -> [batch size, n_filters]
            pooled.append(F.max_pool1d(feature_map, feature_map.shape[2]).squeeze(2))

        # [batch size, n_filters * len(filter_sizes)], independent of sentence
        # length; dropout is applied to the concatenated features.
        cat = self.dropout(torch.cat(pooled, dim=1))

        return self.fc(cat)

In [0]:
# Model hyper-parameters.
INPUT_DIM = len(TEXT.vocab)   # one embedding row per vocabulary entry
EMBEDDING_DIM = 100           # embedding width (the vocab uses "glove.6B.100d" vectors)
N_FILTERS = 100               # output channels per convolution
FILTER_SIZES = [3,4,5]        # kernel heights, in tokens
OUTPUT_DIM = 1                # a single logit for binary sentiment
DROPOUT = 0.5

# Instantiate the CNN classifier with the settings above.
model = CNN(
    vocab_size=INPUT_DIM,
    embedding_dim=EMBEDDING_DIM,
    n_filters=N_FILTERS,
    filter_sizes=FILTER_SIZES,
    output_dim=OUTPUT_DIM,
    dropout=DROPOUT,
)

In [0]:
# Pre-trained vectors attached to the vocabulary when it was built with `vectors=...`.
pretrained_embeddings = TEXT.vocab.vectors

# Overwrite the embedding layer's initial weights in place with the pre-trained
# vectors. copy_ returns the destination tensor, which is what this cell displays.
model.embedding.weight.data.copy_(pretrained_embeddings) # load pre-trained embeddings into the embedding layer

tensor([[ 0.0000,  0.0000,  0.0000,  ...,  0.0000,  0.0000,  0.0000],
        [ 0.0000,  0.0000,  0.0000,  ...,  0.0000,  0.0000,  0.0000],
        [-0.0382, -0.2449,  0.7281,  ..., -0.1459,  0.8278,  0.2706],
        ...,
        [ 0.1897, -0.0174,  0.6258,  ..., -0.3503,  0.0343,  0.8224],
        [-0.1428,  0.2808,  0.9812,  ..., -0.3610,  0.0521,  1.0778],
        [-0.2233, -0.0349,  0.7388,  ...,  0.1977, -0.1103,  0.0074]])

In [0]:
import torch.optim as optim

# Move the model to the target device *before* building the optimizer, so the
# optimizer is guaranteed to hold references to the parameters that are
# actually trained (the order recommended by the PyTorch optimizer docs).
model = model.to(device)

# Adam with its default hyper-parameters: no learning rate is set explicitly,
# and unlike plain SGD it adapts the step size for each parameter.
optimizer = optim.Adam(model.parameters())

# Binary cross-entropy computed directly on raw logits (the sigmoid is applied
# inside the loss), placed on the same device as the model.
criterion = nn.BCEWithLogitsLoss()
criterion = criterion.to(device)

In [0]:
def binary_accuracy(preds, y):
    """
    Fraction of correct predictions in a batch (8 right out of 10 gives 0.8, not 8).

    `preds` are raw logits; they are squashed into [0, 1] with a sigmoid and
    rounded to the nearest integer before being compared with the labels `y`.
    The result is a zero-dimensional tensor.
    """
    probabilities = torch.sigmoid(preds)
    hard_labels = torch.round(probabilities)

    # Boolean matches -> floats so that they can be summed and divided.
    hits = (hard_labels == y).float()

    # Average the matches over the batch (length of the first dimension).
    return hits.sum() / len(hits)

In [0]:
def train(model, iterator, optimizer, criterion):
    """Run one training epoch over `iterator`.

    For every batch the gradients are reset, the model is run on `batch.text`,
    the loss against `batch.label` is back-propagated and the optimizer takes
    one step.

    Returns:
        (mean loss, mean accuracy), each averaged over the number of batches
        (every batch counts equally, whatever its size).
    """
    model.train()  # training mode: dropout is active

    running_loss = 0
    running_acc = 0

    for batch in iterator:
        # PyTorch accumulates gradients, so clear those of the previous step.
        optimizer.zero_grad()

        # [batch size, 1] -> [batch size], the shape the loss expects.
        logits = model(batch.text).squeeze(1)
        targets = batch.label

        loss = criterion(logits, targets)
        acc = binary_accuracy(logits, targets)

        loss.backward()   # gradients for every parameter
        optimizer.step()  # parameter update

        running_loss += loss.item()
        running_acc += acc.item()

    n_batches = len(iterator)
    return running_loss / n_batches, running_acc / n_batches

In [0]:
def evaluate(model, iterator, criterion):
    """Compute loss and accuracy over `iterator` without updating the model.

    Returns:
        (mean loss, mean accuracy), each averaged over the number of batches
        (every batch counts equally, whatever its size).
    """
    model.eval()  # evaluation mode: dropout is switched off

    running_loss = 0
    running_acc = 0

    # No gradients are needed here: nothing is updated during evaluation.
    with torch.no_grad():
        for batch in iterator:
            # [batch size, 1] -> [batch size], the shape the loss expects.
            logits = model(batch.text).squeeze(1)
            targets = batch.label

            loss = criterion(logits, targets)
            acc = binary_accuracy(logits, targets)

            running_loss += loss.item()
            running_acc += acc.item()

    n_batches = len(iterator)
    return running_loss / n_batches, running_acc / n_batches

In [0]:
N_EPOCHS = 5

# Per-epoch history of the metrics, kept for later inspection or plotting.
train_losses, valid_losses = [], []
train_accuracy, valid_accuracy = [], []

for epoch in range(N_EPOCHS):

    # One pass over the training set, then a check on the validation set.
    train_loss, train_acc = train(model, train_iterator, optimizer, criterion)
    valid_loss, valid_acc = evaluate(model, valid_iterator, criterion)

    train_losses.append(train_loss)
    valid_losses.append(valid_loss)
    train_accuracy.append(train_acc)
    valid_accuracy.append(valid_acc)

    print(f'| Epoch: {epoch+1:02} | Train Loss: {train_loss:.3f} | Train Acc: {train_acc*100:.2f}% | Val. Loss: {valid_loss:.3f} | Val. Acc: {valid_acc*100:.2f}% |')

| Epoch: 01 | Train Loss: 0.505 | Train Acc: 73.92% | Val. Loss: 0.337 | Val. Acc: 85.69% |
| Epoch: 02 | Train Loss: 0.310 | Train Acc: 87.20% | Val. Loss: 0.282 | Val. Acc: 88.18% |
| Epoch: 03 | Train Loss: 0.224 | Train Acc: 91.40% | Val. Loss: 0.267 | Val. Acc: 88.79% |
| Epoch: 04 | Train Loss: 0.148 | Train Acc: 94.66% | Val. Loss: 0.262 | Val. Acc: 89.68% |
| Epoch: 05 | Train Loss: 0.094 | Train Acc: 96.93% | Val. Loss: 0.281 | Val. Acc: 89.46% |


In [0]:
# Final check on the held-out test split, using the same evaluate() routine
# that was used for validation during training.
test_loss, test_acc = evaluate(model, test_iterator, criterion)

# Report the mean per-batch loss and the accuracy as a percentage.
print(f'| Test Loss: {test_loss:.3f} | Test Acc: {test_acc*100:.2f}% |')

| Test Loss: 0.308 | Test Acc: 88.14% |


In [0]:
import spacy
nlp = spacy.load('en')

def predict_sentiment(sentence, min_len=5):
    """Score a raw sentence with the trained model.

    Args:
        sentence: raw text to classify.
        min_len: minimum number of tokens fed to the model. The input must be
            at least as long as the largest convolution filter, so shorter
            sentences are right-padded with '<pad>' tokens up to this length.

    Returns:
        The sigmoid of the model's logit as a Python float in [0, 1].

    Note:
        The model is switched to evaluation mode (and left in it) so that
        dropout cannot make the prediction stochastic.
    """
    # Dropout must be off at inference time, whatever mode the model was
    # left in by the previous cell.
    model.eval()

    # Tokenise the raw input.
    tokenized = [tok.text for tok in nlp.tokenizer(sentence)]
    if len(tokenized) < min_len:
        tokenized += ['<pad>'] * (min_len - len(tokenized))

    # Map tokens to their vocabulary indices.
    indexed = [TEXT.vocab.stoi[t] for t in tokenized]

    # [sent len] -> [sent len, 1]: add the batch dimension the model expects.
    tensor = torch.LongTensor(indexed).to(device).unsqueeze(1)

    # No gradients are needed for a prediction.
    with torch.no_grad():
        prediction = torch.sigmoid(model(tensor))  # squash the logit into [0, 1]

    # Single-element tensor -> Python float.
    return prediction.item()

In [0]:
# Example call on a short review; the displayed value is the model's sigmoid output.
predict_sentiment("I seen better movies")

0.4536677896976471