In [None]:
pip install transformers

Collecting transformers
[?25l  Downloading https://files.pythonhosted.org/packages/27/3c/91ed8f5c4e7ef3227b4119200fc0ed4b4fd965b1f0172021c25701087825/transformers-3.0.2-py3-none-any.whl (769kB)
[K     |████████████████████████████████| 778kB 11.3MB/s 
[?25hCollecting sacremoses
[?25l  Downloading https://files.pythonhosted.org/packages/7d/34/09d19aff26edcc8eb2a01bed8e98f13a1537005d31e95233fd48216eed10/sacremoses-0.0.43.tar.gz (883kB)
[K     |████████████████████████████████| 890kB 39.0MB/s 
Collecting sentencepiece!=0.1.92
[?25l  Downloading https://files.pythonhosted.org/packages/d4/a4/d0a884c4300004a78cca907a6ff9a5e9fe4f090f5d95ab341c53d28cbc58/sentencepiece-0.1.91-cp36-cp36m-manylinux1_x86_64.whl (1.1MB)
[K     |████████████████████████████████| 1.1MB 25.2MB/s 
Collecting tokenizers==0.8.1.rc1
[?25l  Downloading https://files.pythonhosted.org/packages/40/d0/30d5f8d221a0ed981a186c8eb986ce1c94e3a6e87f994eae9f4aa5250217/tokenizers-0.8.1rc1-cp36-cp36m-manylinux1_x86_64.whl (3.0M

In [None]:
import torch

import pandas as pd
import random
import numpy as np
from torchtext import data
from torchtext import datasets
import torch.nn as nn
import torch.nn.functional as F
import spacy
from collections import Counter

from transformers import BertTokenizer, BertModel

import torch.optim as optim

import time
import random

# Seed every random number generator the notebook uses (Python, NumPy, PyTorch)
# so that shuffling and weight initialisation repeat from run to run.
SEED = 1234

random.seed(SEED)
np.random.seed(SEED)
torch.manual_seed(SEED)
# Request deterministic cuDNN kernels; only has an effect when running on a GPU.
torch.backends.cudnn.deterministic = True

In [None]:
# Tokenizer whose vocabulary matches the pretrained 'bert-base-uncased' checkpoint.
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

HBox(children=(FloatProgress(value=0.0, description='Downloading', max=231508.0, style=ProgressStyle(descripti…




In [None]:
# Special tokens (as strings) read from the tokenizer instead of being hard-coded.
init_token = tokenizer.cls_token
eos_token = tokenizer.sep_token
pad_token = tokenizer.pad_token
unk_token = tokenizer.unk_token

print(init_token, eos_token, pad_token, unk_token)

[CLS] [SEP] [PAD] [UNK]


In [None]:
# Integer vocabulary ids of the same special tokens; the torchtext fields and the
# batch-building helpers in this notebook work with these ids.
init_token_idx = tokenizer.cls_token_id
eos_token_idx = tokenizer.sep_token_id
pad_token_idx = tokenizer.pad_token_id
unk_token_idx = tokenizer.unk_token_id

print(init_token_idx, eos_token_idx, pad_token_idx, unk_token_idx)

101 102 0 100


In [None]:
# Maximum number of tokens the tokenizer reports for this checkpoint;
# tokenised text is truncated against this limit.
max_input_length = tokenizer.max_model_input_sizes['bert-base-uncased']

print(max_input_length)

512


In [None]:
def tokenize_and_cut(sentence):
    """Tokenise ``sentence`` with the BERT tokenizer and truncate the result.

    Two positions are held back from ``max_input_length`` so that the start and
    end tokens added by the torchtext field still fit.

    Returns:
        list of token strings, at most ``max_input_length - 2`` long.
    """
    keep = max_input_length - 2
    return tokenizer.tokenize(sentence)[:keep]

In [None]:
# Quick check of the tokenisation helper on a short phrase.
tokenize_and_cut('positive and good')

['positive', 'and', 'good']

In [None]:
# A single token string is mapped to a single integer id.
tokenizer.convert_tokens_to_ids('positive')

3893

In [None]:
def _make_bert_field():
    """Create a torchtext Field that yields BERT vocabulary ids directly.

    ``use_vocab=False`` skips torchtext's own vocabulary: tokens are produced by
    ``tokenize_and_cut`` and converted to ids by the BERT tokenizer, and the
    special tokens are supplied as ids as well. Batches are laid out as
    [batch size, sequence length].
    """
    return data.Field(batch_first = True,
                      use_vocab = False,
                      tokenize = tokenize_and_cut,
                      preprocessing = tokenizer.convert_tokens_to_ids,
                      init_token = init_token_idx,
                      eos_token = eos_token_idx,
                      pad_token = pad_token_idx,
                      unk_token = unk_token_idx)


# The full text and the selected span are preprocessed identically, so both
# fields come from the same factory instead of two copy-pasted definitions.
TEXT = _make_bert_field()
SELECTED_TEXT = _make_bert_field()

# Sentiment label; stored as float in the batches.
LABEL = data.LabelField(dtype = torch.float)

In [None]:
# One (name, field) pair per CSV column, in column order. The leading (None, None)
# entry makes torchtext ignore the first column; the remaining columns are the
# full text, the selected span and the sentiment label.
fields = [(None, None), ('text', TEXT), ('selected_text', SELECTED_TEXT), ('label', LABEL)]

In [None]:
# Read the whole labelled CSV from the working directory. `splits` returns a tuple
# even when only `train` is given, which is why the next cell indexes it with [0].
train_data = data.TabularDataset.splits(
                                        path = '',
                                        train = 'train_full.csv',
                                        format = 'csv',
                                        fields = fields,
                                        skip_header = True
)

In [None]:
# `random.seed()` returns None, so passing its result as `random_state` only worked
# because seeding the global generator was a side effect of evaluating the argument.
# Seed explicitly and hand torchtext the generator state it actually expects.
random.seed(SEED)
split_state = random.getstate()

# 80/20 into (train + validation) / test, then 80/20 again into train / validation.
train_data, test_data = train_data[0].split(random_state = split_state, split_ratio=0.8)
train_data, valid_data = train_data.split(random_state = split_state, split_ratio=0.8)

In [None]:
# Report how many examples ended up in each split.
print(f'Number of training examples: {len(train_data)}')
print(f'Number of validation examples: {len(valid_data)}')
print(f'Number of testing examples: {len(test_data)}')

Number of training examples: 17588
Number of validation examples: 4397
Number of testing examples: 5496


In [None]:
# Build the label <-> index mapping from the training split only.
LABEL.build_vocab(train_data)

In [None]:
# Number of distinct sentiment labels found in the training split.
print(f"Unique tokens in LABEL vocabulary: {len(LABEL.vocab)}")

Unique tokens in LABEL vocabulary: 3


In [None]:
# Show which integer index each label string was assigned.
print(LABEL.vocab.stoi)

defaultdict(<function _default_unk_index at 0x7f5c1c1cf048>, {'neutral': 0, 'positive': 1, 'negative': 2})


In [None]:
BATCH_SIZE = 32

# Use the GPU when one is visible, otherwise fall back to the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# One iterator per split; batches are created directly on `device`.
# sort=False: no sort key is defined for these datasets, so sorting is switched off.
train_iterator, valid_iterator, test_iterator = data.BucketIterator.splits(
    (train_data, valid_data, test_data), 
    batch_size = BATCH_SIZE, 
    device = device, sort=False)

In [None]:
# Pretrained encoder; it is wrapped by the model class defined in the next cell.
bert = BertModel.from_pretrained('bert-base-uncased')

HBox(children=(FloatProgress(value=0.0, description='Downloading', max=433.0, style=ProgressStyle(description_…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=440473133.0, style=ProgressStyle(descri…




In [None]:
class BERTGRUSentiment(nn.Module):
    """BERT encoder followed by a convolution, max-over-time pooling and a linear layer.

    Despite the "GRU" in its name the class contains no recurrent layer: the
    encoder output is passed through one 2-D convolution, pooled over the
    sequence dimension, regularised with dropout and projected to
    ``output_dim`` values.

    Args:
        bert: encoder module; must expose ``config.to_dict()['hidden_size']``
            and return the per-token hidden states as the first element of
            its output.
        n_filters: number of convolution output channels.
        output_dim: size of the final linear projection.
        filter_sizes: a single int, the number of consecutive tokens covered
            by the convolution kernel.
        dropout: dropout probability applied to the pooled features.
    """

    def __init__(self, bert, n_filters, output_dim, filter_sizes, dropout):
        super().__init__()

        self.bert = bert
        hidden_size = bert.config.to_dict()['hidden_size']

        # The kernel spans the whole embedding width, so it only slides along the tokens.
        self.conv = nn.Conv2d(in_channels=1,
                              out_channels=n_filters,
                              kernel_size=(filter_sizes, hidden_size))
        self.out = nn.Linear(n_filters, output_dim)
        self.dropout = nn.Dropout(dropout)

    def forward(self, text):
        """Map token ids of shape [batch size, sent len] to [batch size, output_dim]."""
        # The encoder is run without building a graph, so no gradient reaches it.
        with torch.no_grad():
            token_states = self.bert(text)[0]  # [batch size, sent len, emb dim]

        # Add the single input channel expected by Conv2d.
        as_image = token_states.unsqueeze(1)  # [batch size, 1, sent len, emb dim]

        # The convolution collapses the embedding axis to width 1, which is squeezed away.
        feature_maps = F.relu(self.conv(as_image).squeeze(3))  # [batch size, n_filters, sent len - filter_sizes + 1]

        # Max over all positions keeps the strongest response of every filter.
        strongest = F.max_pool1d(feature_maps, feature_maps.shape[2]).squeeze(2)  # [batch size, n_filters]

        return self.out(self.dropout(strongest))  # [batch size, output_dim]

In [None]:
N_FILTERS = 100
# A single kernel height: each filter looks at 3 consecutive tokens.
FILTER_SIZES = 3
# Two outputs per example: the training code compares them with (start, end) positions.
OUTPUT_DIM = 2
DROPOUT = 0.5

model = BERTGRUSentiment(bert,
                         N_FILTERS,
                         OUTPUT_DIM,
                         FILTER_SIZES,
                         DROPOUT)

In [None]:
def count_parameters(model):
    """Return the total number of scalar weights in ``model`` that require gradients."""
    trainable = 0
    for weight in model.parameters():
        if weight.requires_grad:
            trainable += weight.numel()
    return trainable

print(f'The model has {count_parameters(model):,} trainable parameters')

The model has 109,712,942 trainable parameters


In [None]:
# Freeze the encoder: every parameter whose name starts with 'bert' stops
# receiving gradients, leaving only the layers added on top trainable.
for name, param in model.named_parameters():
    if not name.startswith('bert'):
        continue
    param.requires_grad = False

In [None]:
# `count_parameters` is already defined in an earlier cell, so it is not redefined here.
# Counting again shows how many parameters are still trainable after freezing BERT.
print(f'The model has {count_parameters(model):,} trainable parameters')

The model has 230,702 trainable parameters


In [None]:
# Sanity check: list the names of the parameters that still require gradients.
for name, param in model.named_parameters():                
    if param.requires_grad:
        print(name)

conv.weight
conv.bias
out.weight
out.bias


In [None]:
# Adam with its default hyperparameters. All parameters are passed in, including the
# frozen BERT ones (requires_grad was set to False in an earlier cell).
optimizer = optim.Adam(model.parameters())

In [None]:
# Mean squared error between the two model outputs and the (start, end) target positions.
# The cross-entropy line is kept from an earlier classification variant of this notebook.
#criterion = nn.CrossEntropyLoss()
criterion = nn.MSELoss()

In [None]:
# Move the model and the loss module to the selected device.
model = model.to(device)
criterion = criterion.to(device)

In [None]:
def categorical_accuracy(preds, y):
    """Return the fraction of rows whose highest-scoring class equals the label.

    The result is a fraction, i.e. 8 correct out of 10 gives 0.8, not 8.

    Args:
        preds: tensor of shape [batch size, n classes] with one score per class.
        y: tensor of shape [batch size] holding the correct class indices.

    Returns:
        Float tensor of shape [1], located on the same device as ``y``.
    """
    predicted_class = preds.argmax(dim = 1)  # index of the highest score in every row
    correct = predicted_class.eq(y)          # True where prediction and label agree
    # Build the divisor on the device of the inputs; a CPU-only divisor cannot be
    # combined with a CUDA tensor when the batch lives on the GPU.
    batch_size = torch.tensor([y.shape[0]], dtype = torch.float, device = correct.device)
    return correct.sum() / batch_size

In [None]:
def intersection(lst1, lst2):
    """Return the multiset intersection of two 1-D tensors as a Python list.

    A value occurring ``a`` times in ``lst1`` and ``b`` times in ``lst2``
    appears ``min(a, b)`` times in the result.
    """
    left_counts = Counter(lst1.tolist())
    right_counts = Counter(lst2.tolist())
    shared = left_counts & right_counts
    return list(shared.elements())

In [None]:
def jaccard_similarity(list1,list2):
    """Jaccard score of two token sequences, counting repeated tokens.

    The overlap is the multiset intersection size; the union is the sum of both
    lengths minus that overlap. Two empty sequences score 0.0.
    """
    overlap = len(intersection(list1, list2))
    combined = len(list1) + len(list2) - overlap
    if combined == 0:
        return 0.0
    return float(overlap) / combined

In [None]:
def jaccard(text, pred, y):
    """Mean Jaccard score between predicted and target token spans of a batch.

    Args:
        text: tensor [batch size, sent len] of token ids.
        pred: tensor [batch size, 2] of predicted (start, end) positions; values
            are rounded to the nearest integer before use.
        y: tensor [batch size, 2] of target (start, end) positions.

    Returns:
        Float tensor of shape [1] with the average score over the batch.
    """
    def _as_index(position):
        # A negative slice bound would count from the END of the sequence and score
        # the prediction against unrelated tokens, so clamp it to the first position.
        return max(0, position.round().int().item())

    total = 0
    for i, tokens in enumerate(text):
        predicted_span = tokens[_as_index(pred[i][0]):_as_index(pred[i][1])]
        target_span = tokens[_as_index(y[i][0]):_as_index(y[i][1])]
        total += jaccard_similarity(predicted_span, target_span)
    return total / torch.FloatTensor([text.shape[0]])

In [None]:
def LabelToId(x):
    """Translate a label index into the BERT vocabulary id of the label's word.

    ``x`` indexes ``LABEL.vocab.itos``; the label string found there is looked
    up in the BERT tokenizer's vocabulary.
    """
    label_word = LABEL.vocab.itos[x]
    return tokenizer.convert_tokens_to_ids(label_word)

In [None]:
def process(text, label):
    """Build the model input: [CLS], label id, [PAD], [PAD], then the text without its first token.

    Args:
        text: tensor [batch size, sent len] of token ids; column 0 is dropped and
            replaced by the four-token prefix.
        label: 1-D tensor of label indices, one per row of ``text``.

    Returns:
        int64 tensor of shape [batch size, sent len + 3].
    """
    n_rows = label.shape[0]

    def _constant_column(token_id):
        # One column holding the same id in every row.
        return torch.tensor(np.full((n_rows, 1), token_id), dtype=torch.int64, device=device)

    # BERT vocabulary id of every row's sentiment word.
    label_ids = [LabelToId(int(value.item())) for value in label]
    label_column = torch.tensor(label_ids, dtype=torch.int64, device=device).unsqueeze(1)

    pad_column = _constant_column(pad_token_idx)
    pieces = (_constant_column(init_token_idx), label_column, pad_column, pad_column, text[:, 1:])
    return torch.cat(pieces, 1)

In [None]:
def get_positions(text, selected_text, prefix_len=4):
    """Locate each selected span inside its text and return (start, end) positions.

    Both inputs are batches of token ids whose first column (the start token) is
    ignored. For every row the tokens of ``selected_text`` that precede its end
    token are aligned against the text at every position where the first token
    matches. The longest alignment wins and ties go to the earliest position;
    the search stops as soon as the whole selection is matched. Previously a
    later occurrence of the first token overwrote an earlier, correct match.

    Args:
        text: tensor [batch size, text len] of token ids.
        selected_text: tensor [batch size, selection len] of token ids.
        prefix_len: value added to both positions. The default of 4 is the
            number of tokens ``process`` places in front of the text, so the
            positions index into the tensor that ``process`` returns.

    Returns:
        int64 CPU tensor of shape [batch size, 2]. ``end`` is exclusive. A row
        without any match yields ``(prefix_len, prefix_len)``.
    """
    spans = []
    rows = zip(text[:, 1:].tolist(), selected_text[:, 1:].tolist())
    for text_row, selected_row in rows:
        # Only the tokens before the end token belong to the selection;
        # whatever follows it is padding.
        if eos_token_idx in selected_row:
            wanted = selected_row[:selected_row.index(eos_token_idx)]
        else:
            wanted = selected_row
        first_token = selected_row[0] if selected_row else None

        start_position = 0
        best_length = -1  # -1 means "no alignment found yet"
        for j, token in enumerate(text_row):
            if token != first_token:
                continue
            # Count how many consecutive tokens agree, never reading past the row.
            k = 0
            while k < len(wanted) and j + k < len(text_row) and text_row[j + k] == wanted[k]:
                k += 1
            if k > best_length:
                start_position, best_length = j, k
                if k == len(wanted):
                    break  # complete match, nothing later can be better

        end_position = start_position + max(best_length, 0)
        spans.append([start_position + prefix_len, end_position + prefix_len])

    # reshape keeps the [0, 2] shape for an empty batch
    return torch.tensor(spans, dtype=torch.int64).reshape(-1, 2)

In [None]:
def create_baseline(text, pos):
    """Draw a random (start, end) pair per row to serve as a baseline prediction.

    ``start`` is uniform between 4 and the index of the end token of the row,
    ``end`` is uniform between ``start`` and that same index (both inclusive).
    The result is a float tensor with the shape of ``pos``.
    """
    baseline = torch.empty((pos.shape))
    for row_idx, row in enumerate(text):
        # Index of the end token; `.item()` requires exactly one occurrence in the row.
        last = (row == eos_token_idx).nonzero().item()
        begin = random.randint(4, last)
        finish = random.randint(begin, last)
        baseline[row_idx] = torch.tensor([[begin, finish]])
    return baseline

In [None]:
def train(model, iterator, optimizer, criterion):
    """Run one training pass over ``iterator``.

    For every batch the model input is assembled with ``process``, the target
    positions with ``get_positions``, and one optimizer step is taken on the loss
    between the model output and those targets.

    Returns:
        (mean loss per batch, mean Jaccard score per batch)
    """
    model.train()

    loss_sum = 0
    jacc_sum = 0

    for batch in iterator:
        optimizer.zero_grad()

        model_input = process(batch.text, batch.label)
        targets = get_positions(batch.text, batch.selected_text).float().to(device)

        outputs = model(model_input)
        batch_loss = criterion(outputs, targets)
        # The Jaccard score is only monitored; it is not part of the loss.
        batch_jacc = jaccard(model_input, outputs, targets)

        batch_loss.backward()
        optimizer.step()

        loss_sum += batch_loss.item()
        jacc_sum += batch_jacc.item()

    n_batches = len(iterator)
    return loss_sum / n_batches, jacc_sum / n_batches

In [None]:
def evaluate(model, iterator, criterion):
    """Score the model and a random baseline on ``iterator`` without updating weights.

    Returns:
        (model loss, model Jaccard, baseline loss, baseline Jaccard), each
        averaged over the number of batches.
    """
    model.eval()

    model_loss_sum, model_jacc_sum = 0, 0
    base_loss_sum, base_jacc_sum = 0, 0

    with torch.no_grad():
        for batch in iterator:
            model_input = process(batch.text, batch.label)
            targets = get_positions(batch.text, batch.selected_text).float().to(device)

            outputs = model(model_input)
            model_loss = criterion(outputs, targets)
            model_jacc = jaccard(model_input, outputs, targets)

            # Random spans, scored exactly like the model output.
            random_spans = create_baseline(model_input, targets).to(device)
            base_jacc = jaccard(model_input, random_spans, targets)
            base_loss = criterion(random_spans, targets)

            model_loss_sum += model_loss.item()
            model_jacc_sum += model_jacc.item()
            base_jacc_sum += base_jacc.item()
            base_loss_sum += base_loss.item()

    n_batches = len(iterator)
    return (model_loss_sum / n_batches,
            model_jacc_sum / n_batches,
            base_loss_sum / n_batches,
            base_jacc_sum / n_batches)

In [None]:
def epoch_time(start_time, end_time):
    """Split the span between two timestamps (in seconds) into whole minutes and seconds.

    Returns:
        (minutes, seconds) as ints; fractions of a second are dropped.
    """
    duration = end_time - start_time
    whole_minutes = int(duration / 60)
    leftover_seconds = int(duration - (whole_minutes * 60))
    return whole_minutes, leftover_seconds

In [None]:
# Per-epoch history filled by the training loop in the next cell.
# NOTE(review): re-running the training cell without re-running this one appends
# to the existing lists instead of starting over.
train_loss_plot = []
train_jacc_plot = []
valid_loss_plot = []
valid_jacc_plot = []

In [None]:
N_EPOCHS = 20

# Lowest validation loss seen so far; decides when a checkpoint is written.
best_valid_loss = float('inf')

for epoch in range(N_EPOCHS):
    
    start_time = time.time()
    
    # One pass over the training data, then scoring on the validation split
    # (which also scores the random baseline).
    train_loss, train_jacc = train(model, train_iterator, optimizer, criterion)
    valid_loss, valid_jacc, baseline_loss, baseline_jacc = evaluate(model, valid_iterator, criterion)
    train_loss_plot.append(train_loss)
    train_jacc_plot.append(train_jacc)
    valid_loss_plot.append(valid_loss)
    valid_jacc_plot.append(valid_jacc)
        
    end_time = time.time()
        
    epoch_mins, epoch_secs = epoch_time(start_time, end_time)
        
    # Keep only the weights of the epoch with the best validation loss.
    if valid_loss < best_valid_loss:
        best_valid_loss = valid_loss
        torch.save(model.state_dict(), 'bert-model.pt')
    
    print(f'Epoch: {epoch+1:02} | Epoch Time: {epoch_mins}m {epoch_secs}s')
    print(f'\tTrain Loss: {train_loss:.3f} | Train Jacc: {train_jacc*100:.2f}%')
    print(f'\t Val. Loss: {valid_loss:.3f} |  Val. Jacc: {valid_jacc*100:.2f}%')
    print(f'\t  BL. Loss: {baseline_loss:.3f} |   BL. Jacc: {baseline_jacc*100:.2f}%')

Epoch: 01 | Epoch Time: 1m 42s
	Train Loss: 61.472 | Train Jacc: 28.92%
	 Val. Loss: 42.141 |  Val. Jacc: 34.51%
	  BL. Loss: 91.472 |   BL. Jacc: 16.10%
Epoch: 02 | Epoch Time: 1m 42s
	Train Loss: 49.951 | Train Jacc: 32.00%
	 Val. Loss: 40.172 |  Val. Jacc: 36.69%
	  BL. Loss: 90.655 |   BL. Jacc: 16.62%
Epoch: 03 | Epoch Time: 1m 42s
	Train Loss: 47.759 | Train Jacc: 32.90%
	 Val. Loss: 38.190 |  Val. Jacc: 34.71%
	  BL. Loss: 91.529 |   BL. Jacc: 16.26%
Epoch: 04 | Epoch Time: 1m 42s
	Train Loss: 45.701 | Train Jacc: 33.16%
	 Val. Loss: 36.902 |  Val. Jacc: 37.10%
	  BL. Loss: 88.748 |   BL. Jacc: 15.97%
Epoch: 05 | Epoch Time: 1m 42s
	Train Loss: 44.755 | Train Jacc: 33.22%
	 Val. Loss: 36.918 |  Val. Jacc: 33.24%
	  BL. Loss: 94.030 |   BL. Jacc: 15.71%
Epoch: 06 | Epoch Time: 1m 42s
	Train Loss: 43.775 | Train Jacc: 33.54%
	 Val. Loss: 36.336 |  Val. Jacc: 38.12%
	  BL. Loss: 90.930 |   BL. Jacc: 16.14%
Epoch: 07 | Epoch Time: 1m 42s
	Train Loss: 43.368 | Train Jacc: 33.81%
	 Va

In [None]:
# Restore the checkpoint with the best validation loss before scoring the test split.
model.load_state_dict(torch.load('bert-model.pt'))

# `test_acc` holds the mean Jaccard score (not an accuracy); the two baseline values are discarded.
test_loss, test_acc, _ , _ = evaluate(model, test_iterator, criterion)

print(f'Test Loss: {test_loss:.3f} | Test Jacc: {test_acc*100:.2f}%')

Test Loss: 36.969 | Test Jacc: 38.90%


In [None]:
def predict_sentiment(model, tokenizer, sentence, label):
    """Predict the (start, end) positions of the span supporting ``label`` in ``sentence``.

    The input is laid out like the training batches: [CLS], the vocabulary id
    of ``label``, two [PAD] tokens, the sentence tokens and a closing [SEP].

    Args:
        model: trained model mapping token ids to two position values.
        tokenizer: BERT tokenizer used to split and index ``sentence``.
        sentence: raw input text.
        label: sentiment word (e.g. "negative"); it must be a single vocabulary token.

    Returns:
        (prediction, indexed): ``prediction`` is the model output of shape [1, 2],
        ``indexed`` the list of token ids fed to the model, so the predicted
        positions can be used to slice it.
    """
    model.eval()

    prefix = [init_token_idx, tokenizer.convert_tokens_to_ids(label), pad_token_idx, pad_token_idx]

    # Leave room for the prefix and the closing [SEP]; reserving only two slots
    # would let long inputs exceed max_input_length.
    room = max_input_length - len(prefix) - 1
    tokens = tokenizer.tokenize(sentence)[:room]

    indexed = prefix + tokenizer.convert_tokens_to_ids(tokens) + [eos_token_idx]
    tensor = torch.LongTensor(indexed).to(device)
    tensor = tensor.unsqueeze(0)  # add the batch dimension

    # Inference only: no autograd graph is needed.
    with torch.no_grad():
        prediction = model(tensor)
    return prediction, indexed

In [None]:
sentence = "I am very sorry. Today is very ugly."
label = "negative"
# `pred_class` holds the predicted (start, end) positions, not a class.
pred_class, indexed = predict_sentiment(model, tokenizer, sentence , label)
# Round both positions, slice the input ids with them and turn the ids back into tokens.
result = [tokenizer.convert_ids_to_tokens(i) for i in indexed[pred_class[0][0].round().int().item():pred_class[0][1].round().int().item()]]
print(result)

['very']
