In [None]:
!git clone https://github.com/l1905kw/nlp-class-project.git

Cloning into 'nlp-class-project'...
remote: Enumerating objects: 29, done.[K
remote: Counting objects: 100% (29/29), done.[K
remote: Compressing objects: 100% (21/21), done.[K
remote: Total 393 (delta 7), reused 25 (delta 7), pack-reused 364[K
Receiving objects: 100% (393/393), 249.91 MiB | 25.42 MiB/s, done.
Resolving deltas: 100% (95/95), done.
Checking out files: 100% (77/77), done.


In [None]:
!python -m spacy download en

[38;5;2m✔ Download and installation successful[0m
You can now load the model via spacy.load('en_core_web_sm')
[38;5;2m✔ Linking successful[0m
/usr/local/lib/python3.6/dist-packages/en_core_web_sm -->
/usr/local/lib/python3.6/dist-packages/spacy/data/en
You can now load the model via spacy.load('en')


In [None]:
!pip install captum

Collecting captum
[?25l  Downloading https://files.pythonhosted.org/packages/42/de/c018e206d463d9975444c28b0a4f103c9ca4b2faedf943df727e402a1a1e/captum-0.2.0-py3-none-any.whl (1.4MB)
[K     |████████████████████████████████| 1.4MB 3.3MB/s 
Installing collected packages: captum
Successfully installed captum-0.2.0


In [None]:
import spacy

import torch
import torchtext
import torchtext.data
import torch.nn as nn
import torch.nn.functional as F

from torchtext.vocab import Vocab

from captum.attr import LayerIntegratedGradients, TokenReferenceBase, visualization

nlp = spacy.load('en')


In [None]:
# Use the first CUDA device when one is available, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda:0")
else:
    device = torch.device("cpu")

In [None]:
class CNN(nn.Module):
    """Convolutional text classifier over token embeddings.

    Token ids are embedded, passed through one convolution per entry of
    ``filter_sizes`` (each spanning the full embedding width), max-pooled over
    the sequence axis, concatenated, passed through dropout and projected to
    ``output_dim`` logits.

    Args:
        vocab_size: number of rows of the embedding table.
        embedding_dim: width of each token embedding.
        n_filters: output channels of every convolution.
        filter_sizes: sequence of kernel heights (window sizes in tokens).
        output_dim: number of output logits.
        dropout: dropout probability applied to the pooled features.
        pad_idx: index handed to ``nn.Embedding`` as ``padding_idx``.
    """

    def __init__(self, vocab_size, embedding_dim, n_filters, filter_sizes, output_dim,
                 dropout, pad_idx):

        super().__init__()
        self.embedding = nn.Embedding(vocab_size, embedding_dim, padding_idx = pad_idx)
        self.convs = nn.ModuleList([
                                    nn.Conv2d(in_channels = 1,
                                              out_channels = n_filters,
                                              kernel_size = (fs, embedding_dim))
                                    for fs in filter_sizes
                                    ])

        self.fc = nn.Linear(len(filter_sizes) * n_filters, output_dim)
        self.dropout = nn.Dropout(dropout)

    def forward(self, text):
        """Return logits of shape [batch size, output_dim] for a batch of token ids.

        Sequences shorter than the tallest convolution kernel are extended with
        all-zero embedding rows so that every convolution has at least one valid
        window instead of raising a size error.
        """
        embedded = self.embedding(text)
        #embedded = [batch size, sent len, emb dim]

        # A kernel of height k needs at least k rows. Zero rows are appended
        # after the embedding lookup, so no assumption about pad_idx is needed.
        tallest_kernel = max((conv.kernel_size[0] for conv in self.convs), default = 1)
        missing_rows = tallest_kernel - embedded.shape[1]
        if missing_rows > 0:
            embedded = F.pad(embedded, (0, 0, 0, missing_rows))

        embedded = embedded.unsqueeze(1)
        #embedded = [batch size, 1, sent len, emb dim]

        conved = [F.relu(conv(embedded)).squeeze(3) for conv in self.convs]
        #conved_n = [batch size, n_filters, sent len - filter_sizes[n] + 1]

        # Pooling window equals the full remaining length, i.e. max over time.
        pooled = [F.max_pool1d(conv, conv.shape[2]).squeeze(2) for conv in conved]
        #pooled_n = [batch size, n_filters]

        cat = self.dropout(torch.cat(pooled, dim = 1))
        #cat = [batch size, n_filters * len(filter_sizes)]

        return self.fc(cat)

In [None]:
# Input text: split on whitespace, lower-cased, mapped through a vocabulary,
# and batched with the batch dimension first.
TEXT = torchtext.data.Field(
    tokenize=str.split,
    lower=True,
    use_vocab=True,
    batch_first=True,
)

# Target field for the 16-class setup, indexed through its own vocabulary.
# Binary alternative (E vs. I), kept for reference:
#   LABEL = torchtext.data.Field(is_target=True, preprocessing=lambda x: 0 if x[0]=='I' else 1)
LABEL = torchtext.data.Field(
    is_target=True,
    use_vocab=True,
    batch_first=True,
)

In [None]:
from torchtext.data import TabularDataset

'''
train_data, valid_data, test_data = TabularDataset.splits(path='nlp-class-project/preprocess_new',
                                   #train='split_train/mbti.tsv',
                                   #validation='split_val/mbti.tsv',
                                   train='original_train/mbti.tsv',
                                   validation='original_val/mbti.tsv',
                                   test='original_test/mbti.tsv',
                                   format='tsv',
                                   fields=[('label', LABEL), ('text', TEXT)])


train_data, valid_data, test_data = TabularDataset.splits(path='nlp-class-project/gina/',
                                   train='personal_data_train.tsv',
                                   validation='personal_data_valid.tsv',
                                   test='personal_data_test.tsv',
                                   format='tsv',
                                   fields=[('label', LABEL), ('text', TEXT)])
'''

# Read the augmented per-person train/validation/test TSV files; the columns
# are parsed, in order, as (label, text).
train_data, valid_data, test_data = TabularDataset.splits(
    path='nlp-class-project/gina/',
    format='tsv',
    fields=[('label', LABEL), ('text', TEXT)],
    train='personal_data_aug_train.tsv',
    validation='personal_data_aug_valid.tsv',
    test='personal_data_aug_test.tsv',
)


In [None]:
# Build the token vocabulary from the training split only.
TEXT.build_vocab(train_data)

# 16 classes
# NOTE(review): the model summary printed further down reports out_features=18
# while only 16 types are scored, so this label vocabulary presumably also
# contains two special tokens (e.g. <unk>/<pad>) — confirm whether that is intended.
LABEL.build_vocab(train_data)
# Size of the classifier's output layer = number of entries in the label vocabulary.
output_dim = len(LABEL.vocab)

#print(len(LABEL.vocab))


In [None]:
# Show the first example (tokens, then label) of the train and validation splits.
for sample in (train_data[0], valid_data[0]):
    print(sample.text)
    print(sample.label)

# Report how many examples each split holds.
for split_name, split in (('train', train_data), ('valid', valid_data), ('test', test_data)):
    print(split_name + ':', len(split))

['welcome', 'my', 'introverted', 'friend', '.', '=', ')', '[sep]', 'i', 'would', 'have', 'to', 'say', 'that', 'i', 'have', 'been', 'prone', 'to', 'addiction,', 'all', 'it', 'takes', 'is', 'one', 'time', 'and', 'then', "i'm", 'so', 'hungry', 'to', 'do', 'it', 'again', 'that', 'i', 'keep', 'it', 'up', 'and', 'tend', 'to', 'not', 'consider', 'stopping', 'for', 'several', 'years', 'at', 'a', '.', '.', '.', '[sep]', 'i', 'would', 'say', 'that', 'my', 'main', 'thing', 'would', 'be', 'people', 'that', "don't", 'take', 'time', 'to', 'truly', 'get', 'to', 'know', 'me', 'and', 'understand', 'me,', 'which', 'yes', 'does', 'require', 'trying', 'to', 'see', 'things', 'from', 'my', 'point', 'of', 'view', '.', '[sep]', 'i', 'think', "it's", 'just', 'an', 'enfp', 'quality', 'really', '.', '.', '.', "i've", 'always', 'been', 'that', 'way', 'and', 'from', 'time', 'to', 'time', 'it', 'lands', 'me', 'into', 'some', 'serious', 'trouble', '.', 'xp', '[sep]', 'love', 'it', '.', "i'm", 'sixteen', 'ans', "that

In [None]:
from torchtext.data import Iterator, BucketIterator

# One iterator per split, each yielding batches of 128 examples on `device`.
# Sorting is disabled both across the dataset and inside each batch.
# Settings tried earlier: batch_sizes=(256, 256, 256); sort_key=lambda x: len(x.text)
train_iter, valid_iter, test_iter = BucketIterator.splits(
    (train_data, valid_data, test_data),
    batch_sizes=(128, 128, 128),
    device=device,
    sort=False,
    sort_within_batch=False,
)


In [None]:
# Instantiate the classifier: 256-d embeddings, 100 filters for each window
# size from 1 to 5 tokens, and one output logit per label-vocabulary entry.
model = CNN(vocab_size = len(TEXT.vocab), 
            embedding_dim = 256,
            n_filters = 100,
            filter_sizes = [1,2,3,4,5],
            output_dim = output_dim,
            dropout=0.1,
            pad_idx = TEXT.vocab.stoi['<pad>'])
# Move the parameters to the device selected earlier. An unconditional
# .cuda() raises on CPU-only runtimes and ignores the `device` the iterators
# were built with. `.to()` returns the module, so the cell still shows its summary.
model.to(device)

CNN(
  (embedding): Embedding(139587, 256, padding_idx=1)
  (convs): ModuleList(
    (0): Conv2d(1, 100, kernel_size=(1, 256), stride=(1, 1))
    (1): Conv2d(1, 100, kernel_size=(2, 256), stride=(1, 1))
    (2): Conv2d(1, 100, kernel_size=(3, 256), stride=(1, 1))
    (3): Conv2d(1, 100, kernel_size=(4, 256), stride=(1, 1))
    (4): Conv2d(1, 100, kernel_size=(5, 256), stride=(1, 1))
  )
  (fc): Linear(in_features=500, out_features=18, bias=True)
  (dropout): Dropout(p=0.1, inplace=False)
)

In [None]:
from collections import defaultdict

def get_balanced_accuracy(ref_results, hyp_results):
    """Compute balanced accuracy, per-class accuracy and macro F1.

    Only classes that occur in ``ref_results`` are scored; a class that is
    predicted but never appears as a gold label does not enter any average.

    Args:
        ref_results: iterable of gold labels (hashable).
        hyp_results: iterable of predicted labels, aligned with ``ref_results``.

    Returns:
        A 5-tuple ``(balanced_accuracy, accuracies, num_corrects,
        num_reference, macro_f1)`` where
        * ``balanced_accuracy`` is the unweighted mean of per-class recall,
        * ``accuracies`` maps each gold class to its recall,
        * ``num_corrects`` maps each gold class to its number of correct
          predictions (0 when there are none),
        * ``num_reference`` maps each gold class to its number of gold items,
        * ``macro_f1`` is the unweighted mean of per-class F1, a class without
          any correct prediction contributing 0.
        For empty input both averages are 0.0 and the mappings are empty.
    """
    num_corrects = defaultdict(int)
    num_reference = defaultdict(int)
    num_extract = defaultdict(int)
    for gold, predicted in zip(ref_results, hyp_results):
        num_reference[gold] += 1
        num_extract[predicted] += 1
        if gold == predicted:
            num_corrects[gold] += 1

    sum_accuracy = macro_f1 = 0.
    accuracies = {}

    for label, gold_count in num_reference.items():
        # Make the returned counts explicit for every gold class.
        correct = num_corrects.setdefault(label, 0)
        recall = correct / gold_count
        accuracies[label] = recall
        sum_accuracy += recall
        # With no correct prediction F1 is defined as 0; skipping also avoids
        # dividing by a zero prediction count or a zero precision+recall sum.
        if correct:
            precision = correct / num_extract[label]
            macro_f1 += (2 * precision * recall) / (precision + recall)

    num_labels = len(num_reference)
    if num_labels == 0:
        # Nothing to score: return neutral values instead of dividing by zero.
        return 0., accuracies, num_corrects, num_reference, 0.

    return sum_accuracy / num_labels, accuracies, num_corrects, num_reference, macro_f1 / num_labels


# Sanity check on a toy example: every class has two gold items, and only
# classes 1 and 5 receive one correct prediction each, so their per-class
# accuracy is 0.5 and all others are 0.0.
balanced_accuracy, type_accuracy, type_correct, type_gold, macro_f1 = get_balanced_accuracy([1,2,3,4,5,1,2,3,4,5], [1,1,2,2,3,3,4,4,5,5])
print(balanced_accuracy)
print(type_accuracy)
print(macro_f1)


0.2
{1: 0.5, 2: 0.0, 3: 0.0, 4: 0.0, 5: 0.5}
0.2


In [None]:
import torch.optim as optim
from tqdm.notebook import tqdm
import torch.nn.functional as F

def _run_validation(model, data_iter):
    """Score `model` on every batch of `data_iter` without tracking gradients.

    Puts the model into evaluation mode and leaves it there; the caller is
    responsible for switching back to training mode.

    Returns:
        (mean per-batch cross-entropy loss, number of correct predictions,
         number of examples, predicted class indices, gold class indices)
    """
    model.eval()  # turn on evaluation mode (disables dropout)
    total_loss = 0.0
    corrects = num_examples = 0
    all_preds = []
    all_targets = []
    # no_grad: evaluation needs no autograd graph, which saves memory and time.
    with torch.no_grad():
        for batch in tqdm(data_iter):
            preds = model(batch.text)
            # reshape(-1) keeps a 1-D target even for a batch of one example,
            # where a bare .squeeze() would collapse it to a 0-dim tensor.
            target = batch.label.reshape(-1)
            total_loss += F.cross_entropy(input=preds, target=target).item()
            predicted = torch.max(preds, 1)[1]
            corrects += (predicted == target).sum().item()
            num_examples += target.size(0)
            all_preds.extend(predicted.tolist())
            all_targets.extend(target.tolist())
    return total_loss / max(len(data_iter), 1), corrects, num_examples, all_preds, all_targets


opt = optim.Adam(model.parameters(), lr=1e-3)

epochs = 20

for epoch in range(1, epochs + 1):
    running_loss = 0.0

    for i, batch in enumerate(tqdm(train_iter)):
        # Re-enable training mode every step: the validation passes below
        # switch the model to evaluation mode.
        model.train()
        opt.zero_grad()

        preds = model(batch.text)
        target = batch.label.reshape(-1)
        loss = F.cross_entropy(preds, target)

        loss.backward()
        opt.step()

        # .item() accumulates a Python float rather than a tensor.
        batch_loss = loss.item()
        running_loss += batch_loss
        if (i+1) % 100 == 0:
            # Accuracy of the current mini-batch only.
            corrects = (torch.max(preds, 1)[1] == target).sum().item()
            accuracy = 100.0 * corrects / target.size(0)
            print('Epoch {} step {} Acc.: {:.4f} running Loss: {:.4f}'.format(epoch, i+1, accuracy, batch_loss))

        if (i+1) % 1000 == 0:
            # Intermediate validation pass every 1000 training steps.
            val_loss, corrects, num_val = _run_validation(model, valid_iter)[:3]
            accuracy = 100.0 * corrects/num_val
            print('Epoch: {} Accuracy: {} Validation Loss: {:.4f}'.format(epoch, accuracy, val_loss))

    epoch_loss = running_loss / len(train_iter)
    print('Epoch: {}  Training Loss: {:.4f}'.format(epoch, epoch_loss))

    # End-of-epoch validation: loss, accuracy and per-class metrics.
    val_loss, corrects, num_val, all_preds, all_targets = _run_validation(model, valid_iter)
    accuracy = 100.0 * corrects/num_val
    balanced_accuracy, type_accuracy, type_correct, type_gold, macro_f1 = get_balanced_accuracy(all_targets, all_preds)
    print('Epoch: {} Accuracy: {:.4f} Validation Loss: {:.4f} ({}/{})'.format(epoch, accuracy, val_loss, corrects, num_val))
    print('Macro f1: {:.4f}'.format(macro_f1 * 100))
    print('Balanced Accuracy: {:.4f}'.format(balanced_accuracy * 100))
    print('Accuracy by Type: {} types'.format(len(type_accuracy.keys())))
    for label_idx in type_accuracy.keys():
        print('{} {:.4f} ({}/{})'.format(LABEL.vocab.itos[label_idx], type_accuracy[label_idx], type_correct[label_idx], type_gold[label_idx]))
    # Keep one checkpoint per epoch so any epoch can be restored later.
    torch.save(model.state_dict(), "drive/My Drive/mbti_aug_person_cnn_model_{}.pt".format(epoch))


HBox(children=(FloatProgress(value=0.0, max=163.0), HTML(value='')))

Epoch 1 step 100 Acc.: 45.3125 running Loss: 1.8221

Epoch: 1  Training Loss: 1.7452


HBox(children=(FloatProgress(value=0.0, max=55.0), HTML(value='')))


Epoch: 1 Accuracy: 46.6715 Validation Loss: 1.7468 (3239/6940)
Macro f1: 26.3171
Balanced Accuracy: 24.9960
Accuracy by Type: 16 types
ENFJ 0.0719 (10/139)
ENTP 0.2535 (143/564)
INFP 0.5962 (871/1461)
INTP 0.7998 (851/1064)
ENFP 0.4575 (253/553)
INTJ 0.4455 (388/871)
ISTP 0.4084 (107/262)
INFJ 0.4442 (521/1173)
ISFP 0.1872 (38/203)
ISTJ 0.2229 (35/157)
ISFJ 0.0000 (0/146)
ENTJ 0.1122 (22/196)
ESFP 0.0000 (0/34)
ESFJ 0.0000 (0/33)
ESTJ 0.0000 (0/30)
ESTP 0.0000 (0/54)


HBox(children=(FloatProgress(value=0.0, max=163.0), HTML(value='')))

Epoch 2 step 100 Acc.: 53.1250 running Loss: 1.5412

Epoch: 2  Training Loss: 1.5104


HBox(children=(FloatProgress(value=0.0, max=55.0), HTML(value='')))


Epoch: 2 Accuracy: 49.8127 Validation Loss: 1.6424 (3457/6940)
Macro f1: 31.8865
Balanced Accuracy: 29.6966
Accuracy by Type: 16 types
ENFJ 0.2086 (29/139)
ENTP 0.4167 (235/564)
INFP 0.6810 (995/1461)
INTP 0.6156 (655/1064)
ENFP 0.3653 (202/553)
INTJ 0.4604 (401/871)
ISTP 0.3550 (93/262)
INFJ 0.5558 (652/1173)
ISFP 0.2956 (60/203)
ISTJ 0.2038 (32/157)
ISFJ 0.2671 (39/146)
ENTJ 0.3265 (64/196)
ESFP 0.0000 (0/34)
ESFJ 0.0000 (0/33)
ESTJ 0.0000 (0/30)
ESTP 0.0000 (0/54)


HBox(children=(FloatProgress(value=0.0, max=163.0), HTML(value='')))

Epoch 3 step 100 Acc.: 47.6562 running Loss: 1.4612

Epoch: 3  Training Loss: 1.2930


HBox(children=(FloatProgress(value=0.0, max=55.0), HTML(value='')))


Epoch: 3 Accuracy: 49.4957 Validation Loss: 1.6394 (3435/6940)
Macro f1: 31.9895
Balanced Accuracy: 28.9100
Accuracy by Type: 16 types
ENFJ 0.2662 (37/139)
ENTP 0.3245 (183/564)
INFP 0.5927 (866/1461)
INTP 0.7688 (818/1064)
ENFP 0.4412 (244/553)
INTJ 0.4592 (400/871)
ISTP 0.2481 (65/262)
INFJ 0.5746 (674/1173)
ISFP 0.1921 (39/203)
ISTJ 0.1274 (20/157)
ISFJ 0.2192 (32/146)
ENTJ 0.2449 (48/196)
ESFP 0.0000 (0/34)
ESFJ 0.0000 (0/33)
ESTJ 0.0000 (0/30)
ESTP 0.1667 (9/54)


HBox(children=(FloatProgress(value=0.0, max=163.0), HTML(value='')))

Epoch 4 step 100 Acc.: 63.2812 running Loss: 1.1752

Epoch: 4  Training Loss: 1.0690


HBox(children=(FloatProgress(value=0.0, max=55.0), HTML(value='')))


Epoch: 4 Accuracy: 49.3804 Validation Loss: 1.6700 (3427/6940)
Macro f1: 33.0749
Balanced Accuracy: 30.5628
Accuracy by Type: 16 types
ENFJ 0.2734 (38/139)
ENTP 0.4238 (239/564)
INFP 0.8056 (1177/1461)
INTP 0.4727 (503/1064)
ENFP 0.3617 (200/553)
INTJ 0.4707 (410/871)
ISTP 0.5382 (141/262)
INFJ 0.4910 (576/1173)
ISFP 0.2118 (43/203)
ISTJ 0.2803 (44/157)
ISFJ 0.1575 (23/146)
ENTJ 0.0969 (19/196)
ESFP 0.0000 (0/34)
ESFJ 0.1212 (4/33)
ESTJ 0.0000 (0/30)
ESTP 0.1852 (10/54)


HBox(children=(FloatProgress(value=0.0, max=163.0), HTML(value='')))

Epoch 5 step 100 Acc.: 75.7812 running Loss: 0.7899

Epoch: 5  Training Loss: 0.8406


HBox(children=(FloatProgress(value=0.0, max=55.0), HTML(value='')))


Epoch: 5 Accuracy: 49.5677 Validation Loss: 1.6535 (3440/6940)
Macro f1: 35.2856
Balanced Accuracy: 32.9583
Accuracy by Type: 16 types
ENFJ 0.3309 (46/139)
ENTP 0.3422 (193/564)
INFP 0.5421 (792/1461)
INTP 0.5310 (565/1064)
ENFP 0.5280 (292/553)
INTJ 0.5373 (468/871)
ISTP 0.3702 (97/262)
INFJ 0.6607 (775/1173)
ISFP 0.3005 (61/203)
ISTJ 0.2229 (35/157)
ISFJ 0.3493 (51/146)
ENTJ 0.2500 (49/196)
ESFP 0.0000 (0/34)
ESFJ 0.0303 (1/33)
ESTJ 0.0000 (0/30)
ESTP 0.2778 (15/54)


HBox(children=(FloatProgress(value=0.0, max=163.0), HTML(value='')))

Epoch 6 step 100 Acc.: 83.5938 running Loss: 0.6732

Epoch: 6  Training Loss: 0.6321


HBox(children=(FloatProgress(value=0.0, max=55.0), HTML(value='')))


Epoch: 6 Accuracy: 49.9280 Validation Loss: 1.7139 (3465/6940)
Macro f1: 35.4006
Balanced Accuracy: 32.5856
Accuracy by Type: 16 types
ENFJ 0.3453 (48/139)
ENTP 0.4362 (246/564)
INFP 0.7680 (1122/1461)
INTP 0.5226 (556/1064)
ENFP 0.4231 (234/553)
INTJ 0.5327 (464/871)
ISTP 0.3282 (86/262)
INFJ 0.4425 (519/1173)
ISFP 0.2463 (50/203)
ISTJ 0.3503 (55/157)
ISFJ 0.1986 (29/146)
ENTJ 0.1990 (39/196)
ESFP 0.0000 (0/34)
ESFJ 0.2727 (9/33)
ESTJ 0.0000 (0/30)
ESTP 0.1481 (8/54)


HBox(children=(FloatProgress(value=0.0, max=163.0), HTML(value='')))

Epoch 7 step 100 Acc.: 87.5000 running Loss: 0.4538

Epoch: 7  Training Loss: 0.4364


HBox(children=(FloatProgress(value=0.0, max=55.0), HTML(value='')))


Epoch: 7 Accuracy: 49.6686 Validation Loss: 1.7481 (3447/6940)
Macro f1: 36.3837
Balanced Accuracy: 33.9365
Accuracy by Type: 16 types
ENFJ 0.3381 (47/139)
ENTP 0.3511 (198/564)
INFP 0.6735 (984/1461)
INTP 0.6241 (664/1064)
ENFP 0.4955 (274/553)
INTJ 0.5109 (445/871)
ISTP 0.4084 (107/262)
INFJ 0.4348 (510/1173)
ISFP 0.3941 (80/203)
ISTJ 0.2484 (39/157)
ISFJ 0.2740 (40/146)
ENTJ 0.1939 (38/196)
ESFP 0.0000 (0/34)
ESFJ 0.2424 (8/33)
ESTJ 0.0000 (0/30)
ESTP 0.2407 (13/54)


HBox(children=(FloatProgress(value=0.0, max=163.0), HTML(value='')))

Epoch 8 step 100 Acc.: 90.6250 running Loss: 0.3147

Epoch: 8  Training Loss: 0.3124


HBox(children=(FloatProgress(value=0.0, max=55.0), HTML(value='')))


Epoch: 8 Accuracy: 49.4669 Validation Loss: 1.8164 (3433/6940)
Macro f1: 33.5806
Balanced Accuracy: 30.6401
Accuracy by Type: 16 types
ENFJ 0.2014 (28/139)
ENTP 0.3812 (215/564)
INFP 0.6434 (940/1461)
INTP 0.6400 (681/1064)
ENFP 0.4521 (250/553)
INTJ 0.5327 (464/871)
ISTP 0.3435 (90/262)
INFJ 0.5107 (599/1173)
ISFP 0.1724 (35/203)
ISTJ 0.2611 (41/157)
ISFJ 0.2603 (38/146)
ENTJ 0.1837 (36/196)
ESFP 0.0000 (0/34)
ESFJ 0.0606 (2/33)
ESTJ 0.0000 (0/30)
ESTP 0.2593 (14/54)


HBox(children=(FloatProgress(value=0.0, max=163.0), HTML(value='')))

Epoch 9 step 100 Acc.: 96.0938 running Loss: 0.2061

Epoch: 9  Training Loss: 0.2126


HBox(children=(FloatProgress(value=0.0, max=55.0), HTML(value='')))


Epoch: 9 Accuracy: 48.8617 Validation Loss: 1.8855 (3391/6940)
Macro f1: 35.0596
Balanced Accuracy: 33.2651
Accuracy by Type: 16 types
ENFJ 0.3165 (44/139)
ENTP 0.3706 (209/564)
INFP 0.5640 (824/1461)
INTP 0.5968 (635/1064)
ENFP 0.6148 (340/553)
INTJ 0.4351 (379/871)
ISTP 0.4389 (115/262)
INFJ 0.5729 (672/1173)
ISFP 0.2069 (42/203)
ISTJ 0.2484 (39/157)
ISFJ 0.2877 (42/146)
ENTJ 0.1378 (27/196)
ESFP 0.0000 (0/34)
ESFJ 0.2727 (9/33)
ESTJ 0.0000 (0/30)
ESTP 0.2593 (14/54)


HBox(children=(FloatProgress(value=0.0, max=163.0), HTML(value='')))

Epoch 10 step 100 Acc.: 95.3125 running Loss: 0.1149

Epoch: 10  Training Loss: 0.1537


HBox(children=(FloatProgress(value=0.0, max=55.0), HTML(value='')))


Epoch: 10 Accuracy: 49.2795 Validation Loss: 1.9169 (3420/6940)
Macro f1: 34.4814
Balanced Accuracy: 31.3196
Accuracy by Type: 16 types
ENFJ 0.2518 (35/139)
ENTP 0.3918 (221/564)
INFP 0.7070 (1033/1461)
INTP 0.5761 (613/1064)
ENFP 0.4250 (235/553)
INTJ 0.4524 (394/871)
ISTP 0.3931 (103/262)
INFJ 0.5192 (609/1173)
ISFP 0.2956 (60/203)
ISTJ 0.2293 (36/157)
ISFJ 0.3288 (48/146)
ENTJ 0.0867 (17/196)
ESFP 0.0294 (1/34)
ESFJ 0.1212 (4/33)
ESTJ 0.0000 (0/30)
ESTP 0.2037 (11/54)


HBox(children=(FloatProgress(value=0.0, max=163.0), HTML(value='')))

Epoch 11 step 100 Acc.: 99.2188 running Loss: 0.0684

Epoch: 11  Training Loss: 0.1104


HBox(children=(FloatProgress(value=0.0, max=55.0), HTML(value='')))


Epoch: 11 Accuracy: 49.2651 Validation Loss: 1.9544 (3419/6940)
Macro f1: 33.6354
Balanced Accuracy: 30.8648
Accuracy by Type: 16 types
ENFJ 0.2590 (36/139)
ENTP 0.4238 (239/564)
INFP 0.6407 (936/1461)
INTP 0.5216 (555/1064)
ENFP 0.3834 (212/553)
INTJ 0.5924 (516/871)
ISTP 0.3511 (92/262)
INFJ 0.5507 (646/1173)
ISFP 0.2315 (47/203)
ISTJ 0.2293 (36/157)
ISFJ 0.2671 (39/146)
ENTJ 0.2857 (56/196)
ESFP 0.0000 (0/34)
ESFJ 0.0909 (3/33)
ESTJ 0.0000 (0/30)
ESTP 0.1111 (6/54)


HBox(children=(FloatProgress(value=0.0, max=163.0), HTML(value='')))

Epoch 12 step 100 Acc.: 96.8750 running Loss: 0.0857

Epoch: 12  Training Loss: 0.0877


HBox(children=(FloatProgress(value=0.0, max=55.0), HTML(value='')))


Epoch: 12 Accuracy: 48.4870 Validation Loss: 2.0066 (3365/6940)
Macro f1: 34.3858
Balanced Accuracy: 31.9850
Accuracy by Type: 16 types
ENFJ 0.2374 (33/139)
ENTP 0.3848 (217/564)
INFP 0.6119 (894/1461)
INTP 0.5573 (593/1064)
ENFP 0.3653 (202/553)
INTJ 0.5396 (470/871)
ISTP 0.2710 (71/262)
INFJ 0.5644 (662/1173)
ISFP 0.3399 (69/203)
ISTJ 0.3439 (54/157)
ISFJ 0.2534 (37/146)
ENTJ 0.2194 (43/196)
ESFP 0.0000 (0/34)
ESFJ 0.1515 (5/33)
ESTJ 0.0000 (0/30)
ESTP 0.2778 (15/54)


HBox(children=(FloatProgress(value=0.0, max=163.0), HTML(value='')))

Epoch 13 step 100 Acc.: 98.4375 running Loss: 0.0768

Epoch: 13  Training Loss: 0.0687


HBox(children=(FloatProgress(value=0.0, max=55.0), HTML(value='')))


Epoch: 13 Accuracy: 49.3804 Validation Loss: 2.0542 (3427/6940)
Macro f1: 34.2667
Balanced Accuracy: 31.5683
Accuracy by Type: 16 types
ENFJ 0.2878 (40/139)
ENTP 0.4149 (234/564)
INFP 0.6701 (979/1461)
INTP 0.6494 (691/1064)
ENFP 0.3942 (218/553)
INTJ 0.4007 (349/871)
ISTP 0.3740 (98/262)
INFJ 0.5303 (622/1173)
ISFP 0.2857 (58/203)
ISTJ 0.2357 (37/157)
ISFJ 0.2603 (38/146)
ENTJ 0.2398 (47/196)
ESFP 0.0000 (0/34)
ESFJ 0.0303 (1/33)
ESTJ 0.0000 (0/30)
ESTP 0.2778 (15/54)


HBox(children=(FloatProgress(value=0.0, max=163.0), HTML(value='')))

Epoch 14 step 100 Acc.: 97.6562 running Loss: 0.0823

Epoch: 14  Training Loss: 0.0545


HBox(children=(FloatProgress(value=0.0, max=55.0), HTML(value='')))


Epoch: 14 Accuracy: 48.8473 Validation Loss: 2.0699 (3390/6940)
Macro f1: 34.6930
Balanced Accuracy: 32.1147
Accuracy by Type: 16 types
ENFJ 0.3094 (43/139)
ENTP 0.4202 (237/564)
INFP 0.6454 (943/1461)
INTP 0.4605 (490/1064)
ENFP 0.4213 (233/553)
INTJ 0.4845 (422/871)
ISTP 0.3588 (94/262)
INFJ 0.6155 (722/1173)
ISFP 0.2562 (52/203)
ISTJ 0.2293 (36/157)
ISFJ 0.2877 (42/146)
ENTJ 0.3061 (60/196)
ESFP 0.0000 (0/34)
ESFJ 0.1212 (4/33)
ESTJ 0.0000 (0/30)
ESTP 0.2222 (12/54)


HBox(children=(FloatProgress(value=0.0, max=163.0), HTML(value='')))

Epoch 15 step 100 Acc.: 97.6562 running Loss: 0.0703

Epoch: 15  Training Loss: 0.0463


HBox(children=(FloatProgress(value=0.0, max=55.0), HTML(value='')))


Epoch: 15 Accuracy: 49.1499 Validation Loss: 2.0879 (3411/6940)
Macro f1: 34.6359
Balanced Accuracy: 32.7860
Accuracy by Type: 16 types
ENFJ 0.2734 (38/139)
ENTP 0.3954 (223/564)
INFP 0.5914 (864/1461)
INTP 0.5470 (582/1064)
ENFP 0.5027 (278/553)
INTJ 0.5350 (466/871)
ISTP 0.4122 (108/262)
INFJ 0.5499 (645/1173)
ISFP 0.3103 (63/203)
ISTJ 0.2166 (34/157)
ISFJ 0.3356 (49/146)
ENTJ 0.2194 (43/196)
ESFP 0.0000 (0/34)
ESFJ 0.0606 (2/33)
ESTJ 0.0000 (0/30)
ESTP 0.2963 (16/54)


HBox(children=(FloatProgress(value=0.0, max=163.0), HTML(value='')))

Epoch 16 step 100 Acc.: 98.4375 running Loss: 0.0450

Epoch: 16  Training Loss: 0.0406


HBox(children=(FloatProgress(value=0.0, max=55.0), HTML(value='')))


Epoch: 16 Accuracy: 48.6888 Validation Loss: 2.1321 (3379/6940)
Macro f1: 35.2904
Balanced Accuracy: 32.2645
Accuracy by Type: 16 types
ENFJ 0.2734 (38/139)
ENTP 0.3706 (209/564)
INFP 0.7002 (1023/1461)
INTP 0.4850 (516/1064)
ENFP 0.4069 (225/553)
INTJ 0.4914 (428/871)
ISTP 0.3473 (91/262)
INFJ 0.5371 (630/1173)
ISFP 0.3399 (69/203)
ISTJ 0.2866 (45/157)
ISFJ 0.3425 (50/146)
ENTJ 0.1939 (38/196)
ESFP 0.0294 (1/34)
ESFJ 0.1212 (4/33)
ESTJ 0.0333 (1/30)
ESTP 0.2037 (11/54)


HBox(children=(FloatProgress(value=0.0, max=163.0), HTML(value='')))

Epoch 17 step 100 Acc.: 100.0000 running Loss: 0.0273

Epoch: 17  Training Loss: 0.0334


HBox(children=(FloatProgress(value=0.0, max=55.0), HTML(value='')))


Epoch: 17 Accuracy: 49.0778 Validation Loss: 2.1717 (3406/6940)
Macro f1: 33.7347
Balanced Accuracy: 31.1220
Accuracy by Type: 16 types
ENFJ 0.2590 (36/139)
ENTP 0.4645 (262/564)
INFP 0.7029 (1027/1461)
INTP 0.5216 (555/1064)
ENFP 0.3797 (210/553)
INTJ 0.4914 (428/871)
ISTP 0.3511 (92/262)
INFJ 0.5166 (606/1173)
ISFP 0.2808 (57/203)
ISTJ 0.2675 (42/157)
ISFJ 0.2603 (38/146)
ENTJ 0.1990 (39/196)
ESFP 0.0294 (1/34)
ESFJ 0.0000 (0/33)
ESTJ 0.0333 (1/30)
ESTP 0.2222 (12/54)


HBox(children=(FloatProgress(value=0.0, max=163.0), HTML(value='')))

Epoch 18 step 100 Acc.: 100.0000 running Loss: 0.0202

Epoch: 18  Training Loss: 0.0281


HBox(children=(FloatProgress(value=0.0, max=55.0), HTML(value='')))


Epoch: 18 Accuracy: 49.3948 Validation Loss: 2.1647 (3428/6940)
Macro f1: 34.7085
Balanced Accuracy: 32.6087
Accuracy by Type: 16 types
ENFJ 0.2806 (39/139)
ENTP 0.4610 (260/564)
INFP 0.6420 (938/1461)
INTP 0.5094 (542/1064)
ENFP 0.4593 (254/553)
INTJ 0.5270 (459/871)
ISTP 0.4160 (109/262)
INFJ 0.5431 (637/1173)
ISFP 0.2808 (57/203)
ISTJ 0.3248 (51/157)
ISFJ 0.2603 (38/146)
ENTJ 0.1327 (26/196)
ESFP 0.0000 (0/34)
ESFJ 0.1212 (4/33)
ESTJ 0.0000 (0/30)
ESTP 0.2593 (14/54)


HBox(children=(FloatProgress(value=0.0, max=163.0), HTML(value='')))

Epoch 19 step 100 Acc.: 99.2188 running Loss: 0.0278


KeyboardInterrupt: ignored

In [None]:
# Write the model's current parameters (its state_dict) to a single file.
torch.save(model.state_dict(), "drive/My Drive/mbti_cnn_model.pt")

In [None]:
# Restore the epoch-6 checkpoint (highest validation accuracy in the training
# log above). map_location remaps the stored tensors onto `device`, so a
# checkpoint written on a GPU also loads on a CPU-only runtime.
model.load_state_dict(torch.load('drive/My Drive/mbti_aug_person_cnn_model_6.pt', map_location=device))
model.eval()

CNN(
  (embedding): Embedding(139587, 256, padding_idx=1)
  (convs): ModuleList(
    (0): Conv2d(1, 100, kernel_size=(1, 256), stride=(1, 1))
    (1): Conv2d(1, 100, kernel_size=(2, 256), stride=(1, 1))
    (2): Conv2d(1, 100, kernel_size=(3, 256), stride=(1, 1))
    (3): Conv2d(1, 100, kernel_size=(4, 256), stride=(1, 1))
    (4): Conv2d(1, 100, kernel_size=(5, 256), stride=(1, 1))
  )
  (fc): Linear(in_features=500, out_features=18, bias=True)
  (dropout): Dropout(p=0.1, inplace=False)
)

In [None]:
# Evaluate the currently loaded model on the held-out test split.
all_preds = []
all_targets = []
model.eval()
corrects = num_test = 0
# no_grad: inference only, so no autograd graph needs to be kept.
with torch.no_grad():
    for batch in tqdm(test_iter):
        preds = model(batch.text)
        # reshape(-1) keeps a 1-D target even for a batch of one example,
        # where a bare .squeeze() would collapse it to a 0-dim tensor.
        target = batch.label.reshape(-1)
        predicted = torch.max(preds, 1)[1]
        corrects += (predicted == target).sum().item()
        num_test += target.size(0)
        all_preds.extend(predicted.tolist())
        all_targets.extend(target.tolist())

accuracy = 100.0 * corrects/num_test
balanced_accuracy, type_accuracy, type_correct, type_gold, macro_f1 = get_balanced_accuracy(all_targets, all_preds)
# The header no longer prints `epoch`: that name is a leftover of the training
# loop and does not identify the checkpoint that was loaded for this run.
print('Test Accuracy: {:.4f} ({}/{})'.format(accuracy, corrects, num_test))
print('Macro f1: {:.4f}'.format(macro_f1 * 100))
print('Balanced Accuracy: {:.4f}'.format(balanced_accuracy * 100))
print('Accuracy by Type: {} types'.format(len(type_accuracy.keys())))
for label_idx in type_accuracy.keys():
    print('{} {:.4f} ({}/{})'.format(LABEL.vocab.itos[label_idx], type_accuracy[label_idx], type_correct[label_idx], type_gold[label_idx]))

HBox(children=(FloatProgress(value=0.0, max=55.0), HTML(value='')))


Epoch: 19 Accuracy: 50.0288 (3472/6940)
Macro f1: 35.9224
Balanced Accuracy: 32.8759
Accuracy by Type: 16 types
ENFP 0.4627 (248/536)
INTJ 0.4936 (424/859)
ESFJ 0.2593 (7/27)
ISTP 0.3320 (81/244)
INTP 0.5103 (545/1068)
INFP 0.7861 (1187/1510)
INFJ 0.4291 (484/1128)
ENTP 0.4394 (250/569)
ISTJ 0.4204 (66/157)
ISFP 0.2018 (44/218)
ENFJ 0.3472 (50/144)
ISFJ 0.2132 (29/136)
ENTJ 0.2567 (48/187)
ESTP 0.1084 (9/83)
ESTJ 0.0000 (0/32)
ESFP 0.0000 (0/42)


In [None]:
def forward_with_sigmoid(input):
    """Run the global `model` on `input` and apply an element-wise sigmoid to its outputs."""
    logits = model(input)
    return logits.sigmoid()
# Layer-level integrated gradients taken at the embedding layer. Note that it
# wraps `model` itself, i.e. attributions refer to the raw model outputs and
# not to forward_with_sigmoid.
lig = LayerIntegratedGradients(model, model.embedding)
# Generator for baseline ("reference") inputs filled with the <pad> token index.
token_reference = TokenReferenceBase(reference_token_idx=TEXT.vocab.stoi['<pad>'])

In [None]:
# accumulate a couple of samples in this list for visualization purposes
vis_data_records_ig = []

def interpret_sentence(model, sentence, min_len = 64, label = 0, target = 2):
    """Predict the class of `sentence` and record token attributions for one class.

    Args:
        model: network whose gradients are cleared before attribution. The
            prediction and the attribution themselves go through the
            module-level `forward_with_sigmoid` and `lig` objects.
        sentence: raw text; it is tokenised with the spaCy tokenizer of `nlp`.
        min_len: the token list is right-padded with '<pad>' up to this length.
        label: index into LABEL.vocab of the gold class, used only for display.
        target: index of the output class the attributions are computed for.
            The default (2) is the value that used to be hard-coded here.

    Side effects:
        Prints the predicted class, its score and the convergence delta, and
        appends one record to the module-level `vis_data_records_ig`.

    NOTE(review): the training field tokenises with str.split and lower-cases,
    whereas this function uses the spaCy tokenizer without lower-casing —
    confirm that the mismatch is intended.
    """
    text = [tok.text for tok in nlp.tokenizer(sentence)]
    if len(text) < min_len:
        text += ['<pad>'] * (min_len - len(text))
    indexed = [TEXT.vocab.stoi[t] for t in text]

    model.zero_grad()

    # input_indices dim: [1, sequence_length]
    input_indices = torch.tensor(indexed, device=device).unsqueeze(0)
    seq_length = input_indices.shape[1]

    # predict; no gradient graph is needed for the forward-only prediction
    with torch.no_grad():
        preds = forward_with_sigmoid(input_indices)
    pred, pred_ind = torch.max(preds, 1)
    pred = pred.data.tolist()[0]
    pred_ind = pred_ind.data.tolist()[0]

    # generate reference indices for each sample
    reference_indices = token_reference.generate_reference(seq_length, device=device).unsqueeze(0)

    # compute attributions and approximation delta using layer integrated gradients
    attributions_ig, delta = lig.attribute(input_indices, reference_indices, target=target,
                                           n_steps=500, return_convergence_delta=True)

    print('pred: ', LABEL.vocab.itos[pred_ind], '(', '%.2f'%pred, ')', ', delta: ', abs(delta))

    # Pass the attributed class along so the record is labelled with the class
    # the attributions were actually computed for.
    add_attributions_to_visualizer(attributions_ig, text, pred, pred_ind, label, delta,
                                   vis_data_records_ig, attr_ind=target)

def add_attributions_to_visualizer(attributions, text, pred, pred_ind, label, delta, vis_data_records, attr_ind=1):
    """Append one Captum visualization record for a single attributed input.

    Args:
        attributions: attribution tensor; it is summed over dim 2 and the
            leading dimension is squeezed, leaving one score per token.
        text: list of tokens the attributions belong to.
        pred: predicted score shown in the record.
        pred_ind: index into LABEL.vocab of the predicted class.
        label: index into LABEL.vocab of the gold class.
        delta: convergence delta returned by the attribution call.
        vis_data_records: list the new record is appended to.
        attr_ind: index into LABEL.vocab of the class the attributions refer
            to. Defaults to 1, the index that used to be hard-coded here.
    """
    attributions = attributions.sum(dim=2).squeeze(0)
    # L2-normalise the per-token scores; skip when they are all zero, which
    # would otherwise turn every score into NaN.
    norm = torch.norm(attributions)
    if norm > 0:
        attributions = attributions / norm
    attributions = attributions.cpu().detach().numpy()

    # storing a couple of samples in a list for visualization purposes
    vis_data_records.append(visualization.VisualizationDataRecord(
                            attributions,
                            pred,
                            LABEL.vocab.itos[pred_ind],
                            LABEL.vocab.itos[label],
                            LABEL.vocab.itos[attr_ind],
                            attributions.sum(),       
                            text,
                            delta))

In [None]:
# Drop the reference to valid_iter (presumably to free memory before the
# attribution runs - TODO confirm). Guarded so that re-running this cell after
# the name is already gone does not raise NameError.
if 'valid_iter' in globals():
    del valid_iter

In [None]:
# Ask PyTorch's caching allocator to release GPU memory it holds but is not using.
torch.cuda.empty_cache() 

In [None]:
# Start from an empty record list so re-running the sample cells does not duplicate rows in the visualization.
vis_data_records_ig = []
speech_martin_luther_king = "i am happy to join with you today in what will go down in history as the greatest demonstration for freedom in the history of our nation . five score years ago , a great american , in whose symbolic shadow we stand today , signed the emancipation proclamation . this momentous decree came as a great beacon light of hope to millions of negro slaves who had been seared in the flames of withering injustice . it came as a joyous daybreak to end the long night of their captivity . but one hundred years later , the negro still is not free . one hundred years later , the life of the negro is still sadly crippled by the manacles of segregation and the chains of discrimination . one hundred years later , the negro lives on a lonely island of poverty in the midst of a vast ocean of material prosperity . one hundred years later , the negro is still languished in the corners of american society and finds himself an exile in his own land . and so we 've come here today to dramatize a shameful condition . in a sense we 've come to our nation 's capital to cash a check . when the architects of our republic wrote the magnificent words of the constitution and the declaration of independence , they were signing a promissory note to which every american was to fall heir . this note was a promise that all men , yes , black men as well as white men , would be guaranteed the \" unalienable rights \" of \" life , liberty and the pursuit of happiness . \" it is obvious today that america has defaulted on this promissory note , insofar as her citizens of color are concerned . instead of honoring this sacred obligation , america has given the negro people a bad check , a check which has come back marked \" insufficient funds . \" but we refuse to believe that the bank of justice is bankrupt . we refuse to believe that there are insufficient funds in the great vaults of opportunity of this nation . 
and so , we 've come to cash this check , a check that will give us upon demand the riches of freedom and the security of justice . we have also come to this hallowed spot to remind america of the fierce urgency of now . this is no time to engage in the luxury of cooling off or to take the tranquilizing drug of gradualism . now is the time to make real the promises of democracy . now is the time to rise from the dark and desolate valley of segregation to the sunlit path of racial justice . now is the time to lift our nation from the quicksands of racial injustice to the solid rock of brotherhood . now is the time to make justice a reality for all of god 's children . it would be fatal for the nation to overlook the urgency of the moment . this sweltering summer of the negro 's legitimate discontent will not pass until there is an invigorating autumn of freedom and equality . nineteen sixty-three is not an end , but a beginning . and those who hope that the negro needed to blow off steam and will now be content will have a rude awakening if the nation returns to business as usual . and there will be neither rest nor tranquility in america until the negro is granted his citizenship rights . the whirlwinds of revolt will continue to shake the foundations of our nation until the bright day of justice emerges . but there is something that i must say to my people , who stand on the warm threshold which leads into the palace of justice: in the process of gaining our rightful place , we must not be guilty of wrongful deeds . let us not seek to satisfy our thirst for freedom by drinking from the cup of bitterness and hatred . we must forever conduct our struggle on the high plane of dignity and discipline . we must not allow our creative protest to degenerate into physical violence . again and again , we must rise to the majestic heights of meeting physical force with soul force . 
the marvelous new militancy which has engulfed the negro community must not lead us to a distrust of all white people , for many of our white brothers , as evidenced by their presence here today , have come to realize that their destiny is tied up with our destiny . and they have come to realize that their freedom is inextricably bound to our freedom . we cannot walk alone . and as we walk , we must make the pledge that we shall always march ahead . we cannot turn back . there are those who are asking the devotees of civil rights , \" when will you be satisfied? \" we can never be satisfied as long as the negro is the victim of the unspeakable horrors of police brutality . we can never be satisfied as long as our bodies , heavy with the fatigue of travel , cannot gain lodging in the motels of the highways and the hotels of the cities . **we cannot be satisfied as long as the negro 's basic mobility is from a smaller ghetto to a larger one . we can never be satisfied as long as our children are stripped of their self-hood and robbed of their dignity by signs stating: \" for whites only . \" ** we cannot be satisfied as long as a negro in mississippi cannot vote and a negro in new york believes he has nothing for which to vote . no , no , we are not satisfied , and we will not be satisfied until \" justice rolls down like waters , and righteousness like a mighty stream . \" 1 i am not unmindful that some of you have come here out of great trials and tribulations . some of you have come fresh from narrow jail cells . and some of you have come from areas where your quest -- quest for freedom left you battered by the storms of persecution and staggered by the winds of police brutality . you have been the veterans of creative suffering . continue to work with the faith that unearned suffering is redemptive . 
go back to mississippi , go back to alabama , go back to south carolina , go back to georgia , go back to louisiana , go back to the slums and ghettos of our northern cities , knowing that somehow this situation can and will be changed . let us not wallow in the valley of despair , i say to you today , my friends . and so even though we face the difficulties of today and tomorrow , i still have a dream . it is a dream deeply rooted in the american dream . i have a dream that one day this nation will rise up and live out the true meaning of its creed: \" we hold these truths to be self-evident , that all men are created equal . \" i have a dream that one day on the red hills of georgia , the sons of former slaves and the sons of former slave owners will be able to sit down together at the table of brotherhood . i have a dream that one day even the state of mississippi , a state sweltering with the heat of injustice , sweltering with the heat of oppression , will be transformed into an oasis of freedom and justice . i have a dream that my four little children will one day live in a nation where they will not be judged by the color of their skin but by the content of their character . i have a dream today ! i have a dream that one day , down in alabama , with its vicious racists , with its governor having his lips dripping with the words of \" interposition \" and \" nullification \" -- one day right there in alabama little black boys and black girls will be able to join hands with little white boys and white girls as sisters and brothers . i have a dream today ! i have a dream that one day every valley shall be exalted , and every hill and mountain shall be made low , the rough places will be made plain , and the crooked places will be made straight; \" and the glory of the lord shall be revealed and all flesh shall see it together . \" 2 this is our hope , and this is the faith that i go back to the south with . 
with this faith , we will be able to hew out of the mountain of despair a stone of hope . with this faith , we will be able to transform the jangling discords of our nation into a beautiful symphony of brotherhood . with this faith , we will be able to work together , to pray together , to struggle together , to go to jail together , to stand up for freedom together , knowing that we will be free one day . and this will be the day -- this will be the day when all of god 's children will be able to sing with new meaning: my country 'tis of thee , sweet land of liberty , of thee i sing . land where my fathers died , land of the pilgrim 's pride , from every mountainside , let freedom ring ! and if america is to be a great nation , this must become true . and so let freedom ring from the prodigious hilltops of new hampshire . let freedom ring from the mighty mountains of new york . let freedom ring from the heightening alleghenies of pennsylvania . let freedom ring from the snow-capped rockies of colorado . let freedom ring from the curvaceous slopes of california . but not only that: let freedom ring from stone mountain of georgia . let freedom ring from lookout mountain of tennessee . let freedom ring from every hill and molehill of mississippi . from every mountainside , let freedom ring . and when this happens , and when we allow freedom ring , when we let it ring from every village and every hamlet , from every state and every city , we will be able to speed up that day when all of god 's children , black men and white men , jews and gentiles , protestants and catholics , will be able to join hands and sing in the words of the old negro spiritual: free at last ! free at last ! thank god almighty , we are free at last ! "
speech_lincoln= "four score and seven years ago our fathers brought forth on this continent , a new nation , conceived in liberty , and dedicated to the proposition that all men are created equal . now we are engaged in a great civil war , testing whether that nation , or any nation so conceived and so dedicated , can long endure . we are met on a great battle-field of that war . we have come to dedicate a portion of that field , as a final resting place for those who here gave their lives that that nation might live . it is altogether fitting and proper that we should do this . but , in a larger sense , we can not dedicate -- we can not consecrate -- we can not hallow -- this ground . the brave men , living and dead , who struggled here , have consecrated it , far above our poor power to add or detract . the world will little note , nor long remember what we say here , but it can never forget what they did here . it is for us the living , rather , to be dedicated here to the unfinished work which they who fought here have thus far so nobly advanced . it is rather for us to be here dedicated to the great task remaining before us -- that from these honored dead we take increased devotion to that cause for which they gave the last full measure of devotion -- that we here highly resolve that these dead shall not have died in vain -- that this nation , under god , shall have a new birth of freedom -- and that government of the people , by the people , for the people , shall not perish from the earth ."
# Short quotations used as extra sample inputs; written lower-case with punctuation separated by spaces.
goethe = "treat people as if they were what they ought to be and you help them to become what they are capable of being ."
# NOTE(review): `jrr_tolkin` looks like a misspelling of "tolkien"; renaming it would also require updating every use of the name.
jrr_tolkin = "all that is gold does not glitter ; not all those who wander are lost ; the old that is strong does not wither ; deep roots are not reached by the frost ."
pascal = "thought constitutes the greatness of man . man is a reed , the feeblest thing in nature , but he is a thinking reed ."

In [None]:
# Interpret each sample text; the type paired with it is passed as the
# "true" label that interpret_sentence forwards to the visualization record.
for _sample_text, _mbti_type in (
    (speech_martin_luther_king, "INFJ"),
    (speech_lincoln, "ENFJ"),
    (goethe, "INFJ"),
    (jrr_tolkin, "INFP"),
    (pascal, "INTJ"),
):
    interpret_sentence(model, _sample_text, label=LABEL.vocab.stoi[_mbti_type])

In [None]:
# Two free-text comments, both interpreted with INTJ as the true label.
intj_comment_1 = "this is so accurate that it borderlines creepy lol . I know I 'm weird and have my own flaws and quite often hard to handle , but I 've never really put them down to words and I thought it would be impossible to describe myself. but seeing all these got laid down to this extent of details , I am absolutely impressed and amazed . also I finally get to know why I always connect so well with those incredible villians in the movies and tvs ( Little Finger OMG I so love him ! ) lol ."
intj_comment_2 = "This test is crazy and I'm feeling a bit doubtful. How can this one test result describe my personality so well? I've always been ashamed about who I am, my judgement towards other people's actions and my way to analyze everything that I experience. These results made me realize, that there are so many things considering this type of personality I must be proud of. This helped me a lot in the process of accepting myself and learning to know my strengths ."
for _intj_comment in (intj_comment_1, intj_comment_2):
    interpret_sentence(model, _intj_comment, label=LABEL.vocab.stoi["INTJ"])


In [None]:
# Render every record currently accumulated in vis_data_records_ig.
print('Visualize attributions based on Integrated Gradients')
visualization.visualize_text(vis_data_records_ig)


In [None]:
# Start from an empty record list so only this cell's sample is rendered.
vis_data_records_ig = []
# Only the first example of valid_data is interpreted: the loop body ends with `break`.
# NOTE(review): tqdm adds little while the loop stops after one item - also confirm tqdm is imported earlier in the notebook.
for data in tqdm(valid_data):
  # data.text is joined with spaces and then re-tokenized inside interpret_sentence;
  # presumably text and label are token lists (hence label[0]) - TODO confirm against the Field definitions.
  interpret_sentence(model, ' '.join(data.text), label=LABEL.vocab.stoi[data.label[0]])
  break
visualization.visualize_text(vis_data_records_ig)
