In [90]:
from comet_ml import Experiment
import spacy
import torch.nn as nn
import torch
import torch.nn.functional as F
from torchtext.legacy.data import Field, LabelField, TabularDataset, BucketIterator
import torchtext
import torch.optim as optim
from torch.utils.data import DataLoader
from torch.nn.utils.rnn import pack_padded_sequence, pad_packed_sequence
from sklearn.metrics import f1_score
from tqdm import tqdm_notebook as tqdm
from google.cloud import secretmanager
import time
import numpy as np
import json
import pickle
import warnings
# Filter warnings because Pytorch depricating Field and other functions,
# but not clear on what to use instead yet
warnings.filterwarnings('ignore')

In [2]:
# !pip3 install -U pip setuptools wheel
# !pip3 install -U spacy
# !python -m spacy download en_core_web_trf
# !python -m spacy download en_core_web_sm

In [3]:
with open('../data/raw/comet_cred.json') as f:
    comet_creds = json.load(f)

In [40]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
torch.backends.cudnn.deterministic = True
source_folder = "../data/interim/"

In [5]:
hyper_params = {
    "tokenize": "spacy",
    "lower": True,
    "batch_size": 16,
    "hidden_size": 64,
    "embedding_size": 32,
    "num_classes": 3,
    "num_layers": 1,
    "learning_rate": 0.001,
    "min_freq": 5,
    "bidirectional": True,
    "num_epochs": 75,
    "disabled": False,
    "dropout_p": 0.6,
    "save_model_path": "../models/pytorch_model.pt",
    "save_vocab_path": "../models/pytorch_vocab.pkl"
}

In [6]:
experiment = Experiment(api_key=comet_creds["api_key"], project_name=comet_creds["project_name"],
                        workspace=comet_creds["workspace"], disabled=hyper_params["disabled"])
experiment.log_parameters(hyper_params)

COMET INFO: Experiment is live on comet.ml https://www.comet.ml/jahnic/general/8e7cd35776e84058a5b453f10a901d29



In [76]:
# Specify fields
label_field = LabelField(dtype=torch.float)
text_field = Field(tokenize='spacy', 
                   tokenizer_language='en_core_web_sm',
                   include_lengths=True)

fields = [('label', label_field), ('text', text_field)]

# Define data
train, valid, test = TabularDataset.splits(path=source_folder, train='train_data.csv', validation='dev_data.csv',
                                           test='test_data.csv', format='CSV', skip_header=True, fields = fields)

# Embedding
MAX_VOCAB_SIZE = 25_000

text_field.build_vocab(train, 
                 max_size = MAX_VOCAB_SIZE, 
#                vectors = "fasttext.en.300d",
                 vectors = "glove.6B.100d",
                 unk_init = torch.Tensor.normal_) # initialize words via Gaussian distribution 

label_field.build_vocab(train)

# Create batched iterators of data
BATCH_SIZE = 16

train_iterator, valid_iterator, test_iterator = BucketIterator.splits(
    (train, valid, test), 
    batch_size = BATCH_SIZE,
    sort_within_batch = True,
    device = device,
    sort = True,
    sort_key=lambda x: len(x.text))

# Validate iterators
print('Train:')
for batch in train_iterator:
    print(b)
    
print('Valid:')
for batch in valid_iterator:
    print(batch)
    
print('Test:')
for batch in test_iterator:
    print(batch)

Train:

[torchtext.legacy.data.batch.Batch of size 16]
	[.label]:[torch.FloatTensor of size 16]
	[.text]:('[torch.LongTensor of size 61x16]', '[torch.LongTensor of size 16]')

[torchtext.legacy.data.batch.Batch of size 16]
	[.label]:[torch.FloatTensor of size 16]
	[.text]:('[torch.LongTensor of size 61x16]', '[torch.LongTensor of size 16]')

[torchtext.legacy.data.batch.Batch of size 16]
	[.label]:[torch.FloatTensor of size 16]
	[.text]:('[torch.LongTensor of size 61x16]', '[torch.LongTensor of size 16]')

[torchtext.legacy.data.batch.Batch of size 16]
	[.label]:[torch.FloatTensor of size 16]
	[.text]:('[torch.LongTensor of size 61x16]', '[torch.LongTensor of size 16]')

[torchtext.legacy.data.batch.Batch of size 16]
	[.label]:[torch.FloatTensor of size 16]
	[.text]:('[torch.LongTensor of size 61x16]', '[torch.LongTensor of size 16]')

[torchtext.legacy.data.batch.Batch of size 16]
	[.label]:[torch.FloatTensor of size 16]
	[.text]:('[torch.LongTensor of size 61x16]', '[torch.LongTensor

	[.text]:('[torch.LongTensor of size 61x16]', '[torch.LongTensor of size 16]')

[torchtext.legacy.data.batch.Batch of size 16]
	[.label]:[torch.FloatTensor of size 16]
	[.text]:('[torch.LongTensor of size 61x16]', '[torch.LongTensor of size 16]')

[torchtext.legacy.data.batch.Batch of size 16]
	[.label]:[torch.FloatTensor of size 16]
	[.text]:('[torch.LongTensor of size 61x16]', '[torch.LongTensor of size 16]')

[torchtext.legacy.data.batch.Batch of size 16]
	[.label]:[torch.FloatTensor of size 16]
	[.text]:('[torch.LongTensor of size 61x16]', '[torch.LongTensor of size 16]')

[torchtext.legacy.data.batch.Batch of size 16]
	[.label]:[torch.FloatTensor of size 16]
	[.text]:('[torch.LongTensor of size 61x16]', '[torch.LongTensor of size 16]')

[torchtext.legacy.data.batch.Batch of size 16]
	[.label]:[torch.FloatTensor of size 16]
	[.text]:('[torch.LongTensor of size 61x16]', '[torch.LongTensor of size 16]')

[torchtext.legacy.data.batch.Batch of size 16]
	[.label]:[torch.FloatTensor of s

	[.text]:('[torch.LongTensor of size 61x16]', '[torch.LongTensor of size 16]')

[torchtext.legacy.data.batch.Batch of size 16]
	[.label]:[torch.FloatTensor of size 16]
	[.text]:('[torch.LongTensor of size 61x16]', '[torch.LongTensor of size 16]')

[torchtext.legacy.data.batch.Batch of size 16]
	[.label]:[torch.FloatTensor of size 16]
	[.text]:('[torch.LongTensor of size 61x16]', '[torch.LongTensor of size 16]')

[torchtext.legacy.data.batch.Batch of size 16]
	[.label]:[torch.FloatTensor of size 16]
	[.text]:('[torch.LongTensor of size 61x16]', '[torch.LongTensor of size 16]')

[torchtext.legacy.data.batch.Batch of size 16]
	[.label]:[torch.FloatTensor of size 16]
	[.text]:('[torch.LongTensor of size 61x16]', '[torch.LongTensor of size 16]')

[torchtext.legacy.data.batch.Batch of size 16]
	[.label]:[torch.FloatTensor of size 16]
	[.text]:('[torch.LongTensor of size 61x16]', '[torch.LongTensor of size 16]')

[torchtext.legacy.data.batch.Batch of size 16]
	[.label]:[torch.FloatTensor of s

In [53]:
# with open(hyper_params["save_vocab_path"], 'wb') as output:
#     pickle.dump(text_field, output)

# torch.save(text_field.vocab_cls, hyper_params["save_vocab_path"])

In [54]:
class RNN(nn.Module):
    def __init__(self, vocab_size, embedding_dim, hidden_dim, output_dim, n_layers, 
                 bidirectional, dropout, pad_idx):
        
        super().__init__()
        
        self.embedding = nn.Embedding(vocab_size, embedding_dim, padding_idx = pad_idx)
        self.rnn = nn.LSTM(embedding_dim, 
                           hidden_dim, 
                           num_layers=n_layers, 
                           bidirectional=bidirectional, 
                           dropout=dropout)
        
        self.fc = nn.Linear(hidden_dim * 2, output_dim)
        self.dropout = nn.Dropout(dropout)
        
    def forward(self, text, text_lengths):
        
        #text = [sent len, batch size]
        
        embedded = self.dropout(self.embedding(text))
        
        #embedded = [sent len, batch size, emb dim]
        
        #pack sequence
        # lengths need to be on CPU!
        packed_embedded = nn.utils.rnn.pack_padded_sequence(embedded, text_lengths.to('cpu'))
        packed_output, (hidden, cell) = self.rnn(packed_embedded)
        
        #unpack sequence
        output, output_lengths = nn.utils.rnn.pad_packed_sequence(packed_output)

        #output = [sent len, batch size, hid dim * num directions]
        #output over padding tokens are zero tensors
        
        #hidden = [num layers * num directions, batch size, hid dim]
        #cell = [num layers * num directions, batch size, hid dim]
        
        #concat the final forward (hidden[-2,:,:]) and backward (hidden[-1,:,:]) hidden layers
        #and apply dropout
        
        hidden = self.dropout(torch.cat((hidden[-2,:,:], hidden[-1,:,:]), dim = 1))
                
        #hidden = [batch size, hid dim * num directions]
            
        return self.fc(hidden)

In [78]:
INPUT_DIM = len(text_field.vocab)
print(INPUT_DIM)
EMBEDDING_DIM = 100
HIDDEN_DIM = 256
OUTPUT_DIM = 1
N_LAYERS = 2
BIDIRECTIONAL = True
DROPOUT = 0.5
PAD_IDX = text_field.vocab.stoi[text_field.pad_token]

model = RNN(INPUT_DIM, 
            EMBEDDING_DIM, 
            HIDDEN_DIM, 
            OUTPUT_DIM, 
            N_LAYERS, 
            BIDIRECTIONAL, 
            DROPOUT, 
            PAD_IDX)

16729


In [80]:
def convert_numpy_to_one_hot(np_array):
    onehot = np.zeros((np_array.size, np_array.max()+1))
    onehot[np.arange(np_array.size),np_array] = 1
    return onehot

In [81]:
def count_parameters(model):
    return sum(p.numel() for p in model.parameters() if p.requires_grad)

print(f'The model has {count_parameters(model):,} trainable parameters')

pretrained_embeddings = text_field.vocab.vectors
print(pretrained_embeddings.shape)

The model has 3,983,557 trainable parameters
torch.Size([16729, 100])


In [82]:
# Replace initial weights with pretrained embeddings
model.embedding.weight.data.copy_(pretrained_embeddings)

tensor([[ 1.6616, -1.1721,  1.5593,  ..., -0.1488,  0.6016,  1.2918],
        [ 1.1136,  1.0155, -2.1903,  ..., -0.1738, -1.1674,  1.5161],
        [-0.3398,  0.2094,  0.4635,  ..., -0.2339,  0.4730, -0.0288],
        ...,
        [ 0.2928,  0.5095, -1.9078,  ..., -0.0141, -1.0550, -2.0169],
        [-0.4839,  1.1713, -1.3149,  ...,  1.5802, -0.1153,  0.4947],
        [-0.3436, -0.9013,  0.3693,  ...,  0.7002,  0.7689,  0.7272]])

In [84]:
# Initialize unk and pad tokens to 0 vector instaed of random initialization

UNK_IDX = text_field.vocab.stoi[text_field.unk_token]

model.embedding.weight.data[UNK_IDX] = torch.zeros(EMBEDDING_DIM)
model.embedding.weight.data[PAD_IDX] = torch.zeros(EMBEDDING_DIM)

print(model.embedding.weight.data)

tensor([[ 0.0000,  0.0000,  0.0000,  ...,  0.0000,  0.0000,  0.0000],
        [ 0.0000,  0.0000,  0.0000,  ...,  0.0000,  0.0000,  0.0000],
        [-0.3398,  0.2094,  0.4635,  ..., -0.2339,  0.4730, -0.0288],
        ...,
        [ 0.2928,  0.5095, -1.9078,  ..., -0.0141, -1.0550, -2.0169],
        [-0.4839,  1.1713, -1.3149,  ...,  1.5802, -0.1153,  0.4947],
        [-0.3436, -0.9013,  0.3693,  ...,  0.7002,  0.7689,  0.7272]])


In [85]:
# Set training paramaters

optimizer = optim.Adam(model.parameters())
criterion = nn.BCEWithLogitsLoss()

model = model.to(device)
criterion = criterion.to(device)

In [86]:
def binary_accuracy(preds, y):
    """
    Returns accuracy per batch, i.e. if you get 8/10 right, this returns 0.8, NOT 8
    """

    #round predictions to the closest integer
    rounded_preds = torch.round(torch.sigmoid(preds))
    correct = (rounded_preds == y).float() #convert into float for division 
    acc = correct.sum() / len(correct)
    return acc

In [88]:
def train(model, iterator, optimizer, criterion):
    
    epoch_loss = 0
    epoch_acc = 0
    
    model.train()
    
    for batch in iterator:
        
        optimizer.zero_grad()
        text, text_lengths = batch.text
    
        predictions = model(text, text_lengths).squeeze(1)
        loss = criterion(predictions, batch.label)
        acc = binary_accuracy(predictions, batch.label)
        
        loss.backward()
        optimizer.step()
        
        epoch_loss += loss.item()
        epoch_acc += acc.item()
        
    return epoch_loss / len(iterator), epoch_acc / len(iterator)

In [89]:
def evaluate(model, iterator, criterion):
    
    epoch_loss = 0
    epoch_acc = 0
    
    model.eval()
    
    with torch.no_grad():
    
        for batch in iterator:

            text, text_lengths = batch.text
            predictions = model(text, text_lengths).squeeze(1)
            
            loss = criterion(predictions, batch.label)
            acc = binary_accuracy(predictions, batch.label)

            epoch_loss += loss.item()
            epoch_acc += acc.item()
        
    return epoch_loss / len(iterator), epoch_acc / len(iterator)

In [91]:
def epoch_time(start_time, end_time):
    elapsed_time = end_time - start_time
    elapsed_mins = int(elapsed_time / 60)
    elapsed_secs = int(elapsed_time - (elapsed_mins * 60))
    return elapsed_mins, elapsed_secs

In [92]:
# Fit and validate LSTM model

N_EPOCHS = 5

best_valid_loss = float('inf')

for epoch in range(N_EPOCHS):

    start_time = time.time()
    
    train_loss, train_acc = train(model, train_iterator, optimizer, criterion)
    valid_loss, valid_acc = evaluate(model, valid_iterator, criterion)
    
    end_time = time.time()
    epoch_mins, epoch_secs = epoch_time(start_time, end_time)
    
    if valid_loss < best_valid_loss:
        best_valid_loss = valid_loss
        torch.save(model.state_dict(), 'tut2-model.pt')
    
    print(f'Epoch: {epoch+1:02} | Epoch Time: {epoch_mins}m {epoch_secs}s')
    print(f'\tTrain Loss: {train_loss:.3f} | Train Acc: {train_acc*100:.2f}%')
    print(f'\t Val. Loss: {valid_loss:.3f} |  Val. Acc: {valid_acc*100:.2f}%')

Epoch: 01 | Epoch Time: 6m 37s
	Train Loss: 0.711 | Train Acc: 54.74%
	 Val. Loss: 0.814 |  Val. Acc: 61.96%
Epoch: 02 | Epoch Time: 1m 57s
	Train Loss: 0.348 | Train Acc: 55.28%
	 Val. Loss: 0.556 |  Val. Acc: 65.01%
Epoch: 03 | Epoch Time: 1m 57s
	Train Loss: -2.027 | Train Acc: 60.64%
	 Val. Loss: -4.093 |  Val. Acc: 65.69%
Epoch: 04 | Epoch Time: 1m 55s
	Train Loss: -8.136 | Train Acc: 59.80%
	 Val. Loss: -6.918 |  Val. Acc: 66.30%
Epoch: 05 | Epoch Time: 1m 55s
	Train Loss: -20.756 | Train Acc: 60.70%
	 Val. Loss: -16.830 |  Val. Acc: 63.18%


In [93]:
# Test best model on test set

model.load_state_dict(torch.load('lstm-model.pt'))
test_loss, test_acc = evaluate(model, test_iterator, criterion)
print(f'Test Loss: {test_loss:.3f} | Test Acc: {test_acc*100:.2f}%')

Test Loss: -14.201 | Test Acc: 66.30%


In [96]:
# User input

import spacy
nlp = spacy.load('en_core_web_sm')

def predict_sentiment(model, sentence):
    model.eval()
    tokenized = [tok.text for tok in nlp.tokenizer(sentence)]
    indexed = [text_field.vocab.stoi[t] for t in tokenized]
    length = [len(indexed)]
    tensor = torch.LongTensor(indexed).to(device)
    tensor = tensor.unsqueeze(1)
    length_tensor = torch.LongTensor(length)
    prediction = torch.sigmoid(model(tensor, length_tensor))
    return prediction.item()

In [103]:
predict_sentiment(model, "This company is terrible.")

0.07538147270679474

In [105]:
predict_sentiment(model, "Punctual and fast!")

1.0