In [None]:
!git clone https://github.com/l1905kw/nlp-class-project.git

fatal: destination path 'nlp-class-project' already exists and is not an empty directory.


In [None]:
# Download spaCy's English model; the notebook loads it with spacy.load('en').
!python -m spacy download en

[38;5;2m✔ Download and installation successful[0m
You can now load the model via spacy.load('en_core_web_sm')
[38;5;2m✔ Linking successful[0m
/usr/local/lib/python3.6/dist-packages/en_core_web_sm -->
/usr/local/lib/python3.6/dist-packages/spacy/data/en
You can now load the model via spacy.load('en')


In [None]:
# Captum supplies LayerIntegratedGradients, TokenReferenceBase and the
# visualization helpers used by this notebook.
!pip install captum



In [None]:
import spacy

import torch
import torchtext
import torchtext.data
import torch.nn as nn
import torch.nn.functional as F

from torchtext.vocab import Vocab

from captum.attr import LayerIntegratedGradients, TokenReferenceBase, visualization

nlp = spacy.load('en')


In [None]:
# Use the first CUDA GPU when one is available, otherwise fall back to the CPU.
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

In [None]:
class CNN(nn.Module):
    """Convolutional text classifier.

    Pipeline: embedding -> one convolution per filter size -> ReLU ->
    max over positions -> concatenation -> dropout -> linear layer.

    Args:
        vocab_size: number of rows in the embedding table.
        embedding_dim: width of every token embedding.
        n_filters: output channels of each convolution.
        filter_sizes: window heights (in tokens); one convolution per entry.
        output_dim: number of values produced per example.
        dropout: probability handed to ``nn.Dropout``.
        pad_idx: index handed to the embedding as ``padding_idx``.
    """

    def __init__(self, vocab_size, embedding_dim, n_filters, filter_sizes, output_dim,
                 dropout, pad_idx):
        super().__init__()

        self.embedding = nn.Embedding(vocab_size, embedding_dim, padding_idx=pad_idx)

        # Every kernel spans the full embedding width, so it only slides along the tokens.
        conv_layers = []
        for window in filter_sizes:
            conv_layers.append(nn.Conv2d(in_channels=1,
                                         out_channels=n_filters,
                                         kernel_size=(window, embedding_dim)))
        self.convs = nn.ModuleList(conv_layers)

        self.fc = nn.Linear(len(filter_sizes) * n_filters, output_dim)
        self.dropout = nn.Dropout(dropout)

    def forward(self, text):
        """Map token indices [batch size, sent len] to outputs [batch size, output_dim]."""
        # [batch size, sent len, emb dim] -> [batch size, 1, sent len, emb dim]
        embedded = self.embedding(text).unsqueeze(1)

        pooled = []
        for conv in self.convs:
            # feature_map = [batch size, n_filters, sent len - filter size + 1]
            feature_map = F.relu(conv(embedded)).squeeze(3)
            # max over all positions -> [batch size, n_filters]
            pooled.append(F.max_pool1d(feature_map, feature_map.shape[2]).squeeze(2))

        # features = [batch size, n_filters * len(filter_sizes)]
        features = torch.cat(pooled, dim=1)
        return self.fc(self.dropout(features))

In [None]:
# Text field: whitespace-tokenized, lower-cased, mapped through a vocabulary,
# batch dimension first.
TEXT = torchtext.data.Field(
    tokenize=str.split,
    lower=True,
    use_vocab=True,
    batch_first=True,
)

# Label field for the 16-class setup.
# Binary "E vs. I" alternative, kept for reference:
#   LABEL = torchtext.data.Field(is_target=True, preprocessing=lambda x: 0 if x[0]=='I' else 1)
LABEL = torchtext.data.Field(
    is_target=True,
    use_vocab=True,
    batch_first=True,
)

In [None]:
from torchtext.data import TabularDataset

# Every TSV row is read as (label, text). The "original_*" train/val/test files are
# the ones in use; other files mentioned in earlier variants of this cell were
# personal_data_aug_train.tsv, personal_data_aug_test.tsv and split_val/mbti.tsv.
train_data, valid_data, test_data = TabularDataset.splits(
    path='nlp-class-project/preprocess_new',
    train='original_train/mbti.tsv',
    validation='original_val/mbti.tsv',
    test='original_test/mbti.tsv',
    format='tsv',
    fields=[('label', LABEL), ('text', TEXT)],
)

In [None]:
# Build the token vocabulary from the training split only.
TEXT.build_vocab(train_data)

# 16 classes
LABEL.build_vocab(train_data)
# Size of the label vocabulary; passed to the CNN as its output dimension.
# NOTE(review): the model repr in this notebook shows out_features=18, so the label
# vocabulary apparently holds two entries besides the 16 types (presumably '<unk>'
# and '<pad>') — confirm.
output_dim = len(LABEL.vocab)

#print(len(LABEL.vocab))


In [None]:
# Build the classifier. load_state_dict in the next cell restores saved weights into
# it, so these sizes have to agree with the checkpoint.
model = CNN(vocab_size = len(TEXT.vocab), 
            embedding_dim = 256,
            n_filters = 100,
            filter_sizes = [1,2,3,4,5],
            output_dim = output_dim,
            dropout=0.1,
            pad_idx = TEXT.vocab.stoi['<pad>'])
# Place the model on the same `device` the input tensors are created on. The previous
# unconditional model.cuda() raised on CPU-only runtimes and bypassed the CPU fallback
# chosen when `device` was defined.
model.to(device)

CNN(
  (embedding): Embedding(122976, 256, padding_idx=1)
  (convs): ModuleList(
    (0): Conv2d(1, 100, kernel_size=(1, 256), stride=(1, 1))
    (1): Conv2d(1, 100, kernel_size=(2, 256), stride=(1, 1))
    (2): Conv2d(1, 100, kernel_size=(3, 256), stride=(1, 1))
    (3): Conv2d(1, 100, kernel_size=(4, 256), stride=(1, 1))
    (4): Conv2d(1, 100, kernel_size=(5, 256), stride=(1, 1))
  )
  (fc): Linear(in_features=500, out_features=18, bias=True)
  (dropout): Dropout(p=0.1, inplace=False)
)

In [None]:
# map_location remaps the stored tensors onto `device`, so a checkpoint written on a
# GPU can also be restored on a CPU-only runtime.
model.load_state_dict(torch.load('drive/My Drive/mbti_process_new_person_cnn_model_7.pt',
                                 map_location=device))
# Switch to evaluation mode (disables dropout) for prediction and attribution.
model.eval()

CNN(
  (embedding): Embedding(122976, 256, padding_idx=1)
  (convs): ModuleList(
    (0): Conv2d(1, 100, kernel_size=(1, 256), stride=(1, 1))
    (1): Conv2d(1, 100, kernel_size=(2, 256), stride=(1, 1))
    (2): Conv2d(1, 100, kernel_size=(3, 256), stride=(1, 1))
    (3): Conv2d(1, 100, kernel_size=(4, 256), stride=(1, 1))
    (4): Conv2d(1, 100, kernel_size=(5, 256), stride=(1, 1))
  )
  (fc): Linear(in_features=500, out_features=18, bias=True)
  (dropout): Dropout(p=0.1, inplace=False)
)

In [None]:
def forward_with_sigmoid(input):
    """Run the module-level ``model`` on ``input`` and apply an element-wise sigmoid."""
    logits = model(input)
    return torch.sigmoid(logits)


# Integrated gradients taken at the embedding layer of the raw model
# (not of forward_with_sigmoid).
lig = LayerIntegratedGradients(model, model.embedding)
# Builds baseline sequences filled with the index of the '<pad>' token.
token_reference = TokenReferenceBase(reference_token_idx=TEXT.vocab.stoi['<pad>'])

In [None]:
# Accumulate a few samples in this list for visualization purposes.
vis_data_records_ig = []

def interpret_sentence(model, sentence, min_len = 64, label = 0, vis_data_records = None):
    """Compute layer integrated-gradients attributions for one sentence.

    Args:
        model: classifier whose gradients are cleared before attribution. The
            attribution itself goes through the module-level ``lig`` object.
        sentence: whitespace-separated text; tokens are looked up in ``TEXT.vocab``.
        min_len: inputs with fewer tokens are right-padded with '<pad>' to this length.
        label: target class index handed to ``lig.attribute``.
        vis_data_records: optional list. When given, the prediction is computed with
            ``forward_with_sigmoid`` and a visualization record for this sentence is
            appended via ``add_attributions_to_visualizer``. When omitted (default),
            no prediction is run and nothing is recorded.

    Returns:
        The attribution tensor returned by ``lig.attribute`` for the embedding layer.
    """
    text = sentence.split()
    if len(text) < min_len:
        text += ['<pad>'] * (min_len - len(text))
    indexed = [TEXT.vocab.stoi[t] for t in text]

    model.zero_grad()

    # input_indices dim: [1, sequence_length]
    input_indices = torch.tensor(indexed, device=device).unsqueeze(0)

    # Baseline of the same length as the (already padded) input.
    reference_indices = token_reference.generate_reference(len(text), device=device).unsqueeze(0)

    # compute attributions and approximation delta using layer integrated gradients
    attributions_ig, delta = lig.attribute(input_indices, reference_indices, target=label,
                                           n_steps=50, return_convergence_delta=True)

    if vis_data_records is not None:
        # The prediction is only needed for the visualization record, so it is
        # skipped entirely when no record is requested.
        with torch.no_grad():
            preds = forward_with_sigmoid(input_indices)
        pred, pred_ind = torch.max(preds, 1)
        add_attributions_to_visualizer(attributions_ig, text, pred.item(), pred_ind.item(),
                                       label, delta, vis_data_records)

    return attributions_ig
    
def add_attributions_to_visualizer(attributions, text, pred, pred_ind, label, delta, vis_data_records):
    """Append one Captum ``VisualizationDataRecord`` to ``vis_data_records``.

    Args:
        attributions: attribution tensor; it is summed over dim 2 and the leading
            dimension is squeezed, leaving one score per token.
        text: list of tokens shown in the visualization.
        pred: predicted score shown next to the predicted label.
        pred_ind: index of the predicted class in ``LABEL.vocab``.
        label: index of the true class, which is also the class the attributions
            were computed for by the caller.
        delta: convergence delta reported by the attribution method.
        vis_data_records: list that receives the new record.
    """
    attributions = attributions.sum(dim=2).squeeze(0)
    # Normalize to unit length; skip the division for an all-zero vector, which
    # would otherwise turn every score into NaN.
    norm = torch.norm(attributions)
    if norm > 0:
        attributions = attributions / norm
    attributions = attributions.cpu().detach().numpy()

    # storing couple samples in an array for visualization purposes
    vis_data_records.append(visualization.VisualizationDataRecord(
                            attributions,
                            pred,
                            LABEL.vocab.itos[pred_ind],
                            LABEL.vocab.itos[label],
                            # Attribution class: the class the attributions target,
                            # rather than the fixed vocabulary entry at index 1.
                            LABEL.vocab.itos[label],
                            attributions.sum(),       
                            text,
                            delta))

In [None]:
# Keep the 50 shortest test examples and run attribution on the first 10 of them.
# NOTE(review): the return value of interpret_sentence is discarded here and nothing
# in this cell appends to vis_data_records_ig.
data = sorted(test_data, key=lambda example: len(example.text))[:50]
for example in data[:10]:
  target_idx = LABEL.vocab.stoi[example.label[0]]
  interpret_sentence(model, ' '.join(example.text), label=target_idx)

In [None]:
# Render the records collected in vis_data_records_ig with Captum's text visualizer.
visualization.visualize_text(vis_data_records_ig)

True Label,Predicted Label,Attribution Label,Attribution Score,Word Importance
INTP,INTP (0.88),,1.56,"most people are catholic because they have been taught by their relatives that it 's a way of life . as kids , they 'll be brought to church every sunday ... or else . some may really believe the doctrine ... sep i wanted to take over the world using genetically modified plants grown hydroponically sent from my a33fw using tapatalk sep wondering if nokia would have more sales if they used android os on their phones . marketed for durability and equipped with one of the most commonly used os 's . then again this is pretty much what put ... sep esfj . definitely some strong fe vibes that i got from your type me post . sent from my a33fw using tapatalk sep hello :waving_hand: sent from my a33fw using tapatalk sep ah wait . was that plaza and not lounge ? sent from my a33fw using tapatalk sep hello there sent from my a33fw using tapatalk sep why not make an introduction ? wo n't hurt much . i picked up mbti a years ago after getting personality tests from a guidance counselor . so far , i 've been staying in the entertainment lounge for a ... sep i 'll take a buddy ... or a body , if possible . sep face augmentation except you insert balloons into a person 's face and then inflate them so that their face blows up . hey , that sounds like it 'll be great for the future of cosmetic surgery . cheap ... sep either you 're comatose or a vegetable ."
,,,,
INFP,INFP (0.90),,1.42,"' httpurl sep all the time . for as long as i can remember . as a kid i was in gymnastics , ballet , soccer , piano lessons , you name it . never stuck with any of them . =/ sep ravenclaw :p roud :sep httpurl gets me every time . she is adorable .: kitteh :sep httpurl sep httpurl sep marsh sep httpurl sep oh yes , the past can hurt . but you can either run from it , or learn from it . - rafiki , from the lion king ... sep just finishing up the catcher in the rye . and a few chapters into the eye of the world by robert jordan . sep 1 . for a person you loved deeply , would you be willing to move to a distant country knowing there would be little chance of seeing your friends or family again ? yes . i love my family but i 'd ... sep 1 . you have the chance to meet someone with whom you have the most satisfying love imaginable - the stuff of dreams . sadly , you know that in six months the person will die . knowing the pain that ... sep i 'll do it =] sep thanks ! sep 7563275633 7563475635 75636my doggy <3 sep daydreaming . always having my head in the clouds . sep 7556675567 7556875569 75570 sep hi ! i 'm new here , so i thought i 'd introduce my self with a picture of my goofy face .: happy :75565sorry it 's up side down , i was too lazy to fix it .: kitteh : '"
,,,,
ISTP,INTP (0.91),,-0.1,"' httpurl sep httpurl sep httpurl sep httpurl sep pay pay the price pay for nothing 's fair sep httpurl sep httpurl sep youtube - status quo - whatever you want hq lyrics 70s sep httpurl sep youtube - audioslave hypnotize sep youtube - the kills - u r a fever sep youtube - audioslave - be yourself sep youtube - deep purple-burn sep httpurl sep httpurl sep httpurl sep youtube - the runaways - is it day or night sep httpurl sep httpurl sep opening credits :feeling a moment-feeder waking up: arizona-kings of leon first day at school :the unforgiven ii - metallica falling in love: painted on my heart-the cult losing virginity : ... sep httpurl sep httpurl sep httpurl sep httpurl sep httpurl sep httpurl sep httpurl sep youtube - linkin park - crawling sep httpurl sep there 's a fine line what you want and what you need standing right there in between sep and though it 's been a long time , you 're right back where you started from i see it in your eyes that now youx 92re giving up the gun sep httpurl sep httpurl sep httpurl sep httpurl sep youtube - florence and the machine - girl with one eye sep youtube - lacuna coil - heaven 's a lie ( version 3 ) [ official video ] hd + lyrics sep httpurl sep httpurl sep httpurl sep youtube - k 's choice - not an addict ( european version ) sep youtube - depeche mode - precious sep youtube - depeche mode - enjoy the silence original ( not live ) sep hogfather by terry pratchett sep youtube - nirvana - you know you 're right sep youtube - red hot chili peppers - californication official music video sep youtube - wonderwall sep youtube - keane - everybody 's changing sep httpurl sep 1.radiohead - climbing up the walls 2.florence and the machine - between two lungs 3 . t . a . t . u-show me love 4.florence and the machine-rabbit heart 5.ram jam-black betty 6 . t . a . t . u-loves me not ... '"
,,,,
ISTP,ISTP (0.97),,0.78,"' banned for hiding alcohol in that cup :shocked: sep banned because god knows what are you up with that straw or whatever it is . sep if cautioness can be detached from our or any concept of time , space or any outside perception of senses , and if it can be static and and just be what purpose doas it serve in that state ? sep 47589 sep i agree with that statement , our perception of time is ilusion , the real time i would call continuity that keeps our cautioness and cautioness in general alive . let me give an example , as ... sep ha what if you guys try to convince somebody that you'r crazy and than give that in trance look they might interprate it like death stare :crazy: sep agreement sep i was in the same dilema and i chose ee because it offerd more broader practical knowledge , i got to learn anything from microcontrolers to generators / motors . there was ton of lab work which i realy ... sep malfuntion sep what about istp stare of death , if that is n't scary the do notice t-shirt and look again :crazy: 47375 47377 sep scent sep weight sep httpurl sep httpurl sep pissed off , somebody poisoned my cat , good thing i noticed it on time sep when i was a baby they used to call me walker maniac lol , my parrent told me i was fearless with that thing , at the age of 4-5 i found my brothers drilling toy and started driling a knuf wall ... sep 1 ) touch - 10 2 ) vision - 9.5 3 ) hearing - 8 4 ) taste - 4 5 ) scent - 4 sep it can be harder to determinete t / f preference when they are 2 . and 3 . function , in that case it would be good idea to determninate what kind of thinking and what kind of feeling doas a person use sep yeah i saw that too , they can be like this especially when they get enthusiastic about some topic . i think its the need to express their point of view on a subject based on what they saw and ... '"
,,,,
ESTP,INFJ (0.85),,0.52,"' httpurl sep browsing perc , remembering the old times , listening to old music that i used to love and probably worried , but not enough , that i have n't studied for tomorrow . sep httpurl sep httpurl sep httpurl sep dream on dreamer - yourself as someone else httpurl sep waiting , dreaming , watching tv :rolleyes: sep httpurl httpurl httpurl ... sep httpurl sep httpurl httpurl httpurl ... sep good food . sep httpurl httpurl httpurl ... sep httpurl sep httpurl sep sent :omg . i took another version of mbti test and it came out that i 'm still an estp and my s and t are exactly the same , almost 100 % . received: yes but your s and n were almost the same . you ... sep httpurl sep httpurl sep httpurl sep httpurl httpurl httpurl ... sep httpurl sep httpurl can't stop ; it 's addictive sep httpurl show me some attitude . sep httpurl httpurl sep httpurl sep httpurl see'ya . sep httpurl httpurl httpurl ... sep received :i meant x is such a feeler . sent: * cough , cough * after me * cough , cough * sep now that was awkward . sep httpurl httpurl httpurl sep httpurl sep httpurl i never knew i was a techno fan . sep baby gangsta ? :tongue: sep dancing like my vagina is on fire inside my head , because in reality i can't ; i 'm not alone in the room :d ry :sep httpurl show me a funny gif image . sep httpurl sep httpurl sep nada . simple as that . sep httpurl after lots of searches ( forever traumatized ) , i decided to post this one . show me colors . sep i woke up this morning and i started jumping the chord doing 1,000 jumping ( again ) . me is so proud . the circle begins again , this night: laughing : sep httpurl httpurl ... sep incredibly stressed out , thanks for asking . sep httpurl sep httpurl yeah , that 's how bad i am right now . sep pet sematary by stephen king . sep httpurl httpurl httpurl httpurl sep httpurl sep httpurl sep httpurl again . sep i discovered a new band today and i love it . 
sep i move along because the all-american rejects tell me so . '"
,,,,


In [None]:
from tqdm.notebook import tqdm

# Start from an empty visualization buffer.
vis_data_records_ig = []

# Per label: token -> summed attribution, and token -> number of occurrences.
mbti_word_importance = {label_name: {} for label_name in LABEL.vocab.stoi}
mbti_word_occurence = {label_name: {} for label_name in LABEL.vocab.stoi}

# The loop variable is deliberately not called `data`, so the list of short test
# examples built in an earlier cell is not overwritten.
for example in tqdm(valid_data):
  label_name = example.label[0]
  attr = interpret_sentence(model, ' '.join(example.text), label=LABEL.vocab.stoi[label_name])

  # Sum over the embedding dimension and drop only the leading batch dimension.
  # A bare .squeeze() would also collapse a length-1 sequence.
  attr_sum = torch.sum(attr, dim=2).squeeze(0).tolist()

  # setdefault keeps the loop from failing on a label that did not occur in the
  # training split used to build the label vocabulary.
  importance = mbti_word_importance.setdefault(label_name, {})
  occurence = mbti_word_occurence.setdefault(label_name, {})
  # attr_sum may be longer than the text because of padding; only real tokens count.
  for i, w in enumerate(example.text):
    importance[w] = importance.get(w, 0) + attr_sum[i]
    occurence[w] = occurence.get(w, 0) + 1

HBox(children=(FloatProgress(value=0.0, max=867.0), HTML(value='')))




In [None]:

import pickle
# Persist both per-type dictionaries to Drive.
with open('drive/My Drive/mbti_aug_analysis_result.pt', 'wb') as f:
    pickle.dump({'word_importance': mbti_word_importance, 'word_occurence': mbti_word_occurence}, f, pickle.HIGHEST_PROTOCOL)

# Reloading a saved result. This is kept as comments rather than a bare string
# literal, which the notebook would echo as the cell's output.
# Only unpickle files you created yourself: pickle.load can execute arbitrary code.
# NOTE(review): the path below differs from the one written above — confirm which
# file is intended.
# with open('drive/My Drive/mbti_analysis_result.pt', 'rb') as f:
#     data = pickle.load(f)

In [None]:
import pandas as pd

# One column per MBTI type holding its 50 tokens with the largest summed attribution.
# The special vocabulary entries and types with fewer than 50 tokens are left out.
top_words_by_type = {}
for t in sorted(mbti_word_importance):
  if t in ('<unk>', '<pad>'):
    continue

  ranked = sorted(mbti_word_importance[t].items(), key=lambda pair: pair[1], reverse=True)[:50]
  if len(ranked) < 50:
    continue

  top_words_by_type[t] = [word for word, _ in ranked]

df = pd.DataFrame(top_words_by_type)

display(df)
df.to_csv("word_importance.csv")
  #print(t, sorted_words)
  #most_importance_words = sorted([w for w, importance in importances.item()])

Unnamed: 0,ENFJ,ENFP,ENTJ,ENTP,ESFJ,ESFP,ESTJ,ESTP,INFJ,INFP,INTJ,INTP,ISFJ,ISFP,ISTJ,ISTP
0,enfj,enfp,entj,entp,intps,esfp,estj,estp,infj,infp,intj,intp,isfj,isfp,istj,istp
1,!,!,entjs,entps,isfj,!,istj,estps,infjs,infps,intjs,intps,fe,!,istjs,istps
2,enfjs,enfps,an,the,welcome,?,an,fun,feel,feel,.,),an,feel,happy,.
3,feel,an,intj,an,esfp,:d,enfp,an,!,really,),would,feel,really,you,","
4,an,feel,mbti,you,intp,typed,istp,istp,my,feeling,?,the,!,),posting,'ll
5,infj,i,.,?,'m,intj,not,shit,-,and,not,is,would,music,mbti,really
6,all,fun,not,fun,from,her,all,got,you,!,by,",",like,'m,the,shit
7,really,:d,shit,sep,as,enfp,!,car,se,",",are,sep,entj,'ll,rant,maybe
8,time,love,sep,shit,maybe,'m,infp,type,life,-,'s,use,her,istp,being,got
9,their,lol,be,'m,!,entp,by,:),fe,music,would,all,?,feelings,back,fuck


In [None]:
# Average attribution per occurrence: summed importance divided by how often the
# token appeared for that type.
norm_word_importance = {label_name: {} for label_name in LABEL.vocab.stoi}
norm_df = pd.DataFrame()
for t, value in mbti_word_importance.items():
  if t == '<unk>' or t== '<pad>':
    continue
  norm_word_importance[t] = {
      w: importance / mbti_word_occurence[t][w] for w, importance in value.items()
  }

  sorted_words = sorted(norm_word_importance[t].items(), key=lambda s: s[1], reverse=True)[:50]

  # Same guard as the raw-importance table: a type with fewer than 50 tokens would
  # give a column of a different length and make the assignment below fail.
  if len(sorted_words) < 50:
    continue

  norm_df[t] = [w[0] for w in sorted_words]
display(norm_df)
norm_df.to_csv("norm_word_importance.csv")

In [None]:
import math

def consine_similarity(word_importance_1, word_importance_2):
  """Cosine similarity of two sparse vectors given as ``{word: weight}`` dicts.

  A word missing from one dict counts as weight 0 there. Returns 0 when the
  product of the two norms is 0.
  """
  # Union of both vocabularies.
  vocabulary = set([*word_importance_1.keys(), *word_importance_2.keys()])

  dot = 0
  squared_norm_1 = 0
  squared_norm_2 = 0
  for word in vocabulary:
    weight_1 = word_importance_1.get(word, 0)
    weight_2 = word_importance_2.get(word, 0)
    dot += weight_1 * weight_2
    squared_norm_1 += weight_1 * weight_1
    squared_norm_2 += weight_2 * weight_2

  norm_product = (math.sqrt(squared_norm_1) * math.sqrt(squared_norm_2))
  return dot / norm_product if norm_product != 0 else 0

# Sanity checks; the expected value is given after each call.
print(consine_similarity({'a': 0, 'b': 1, 'c': 1, 'd': 1}, {'a': 1, 'b': 0, 'c': 1, 'd': 1})) #0.67
print(consine_similarity({'a': 1.0, 'b': -1.0}, {'a': 1.0, 'b': 0.5})) #0.3162
print(consine_similarity({'a': 1.0, 'b': -1.0}, {'a': 1.0, 'b': -1.0})) #1.0

In [None]:
# Pairwise cosine similarity between the summed word-importance vectors of all
# MBTI types (special vocabulary entries excluded), in alphabetical type order.
similarities = []
top_sim = []
types = []
for t, value in sorted(mbti_word_importance.items()):
  if t == '<unk>' or t== '<pad>':
    continue
  types.append(t)
  similarity = []
  for t2, value2 in sorted(mbti_word_importance.items()):
    if t2 == '<unk>' or t2== '<pad>':
      continue
    s = consine_similarity(value, value2)
    # append() needs the value; calling it without an argument raises TypeError.
    similarity.append(s)
  similarities.append(similarity)

sim_df = pd.DataFrame(similarities, columns=types, index=types)
display(sim_df)


In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

# Annotated heatmap of the type-by-type similarity matrix; colour scale capped at 0.4.
fig, ax = plt.subplots(figsize=(16, 16))
sns.heatmap(sim_df, ax=ax, annot=True, cmap="YlGnBu", vmax=0.4,
            linewidths=0.1, linecolor='white')
plt.show()

In [None]:
# Average similarity grouped by the number of letters two type names have in common.
sim_for_common_letters = {}   # shared-letter count -> summed similarity
num_common_letters = {}       # shared-letter count -> number of pairs

for row_type, sims in zip(types, similarities):
  for col_type, sim in zip(types, sims):
    num_common_char = len(set(row_type) & set(col_type))
    sim_for_common_letters[num_common_char] = sim_for_common_letters.get(num_common_char, 0) + sim
    num_common_letters[num_common_char] = num_common_letters.get(num_common_char, 0) + 1

print('Common letters', 'Avg. Similarity', sep='\t')
for common_letter in sorted(sim_for_common_letters, reverse=True):
  # Four shared letters means a type compared with itself; not reported.
  if common_letter != 4:
    average = sim_for_common_letters[common_letter] / num_common_letters[common_letter]
    print('{}\t{}'.format(common_letter, average))

In [None]:
# Pairwise cosine similarity between the occurrence-normalized importance vectors,
# in alphabetical type order and without the special vocabulary entries.
types = [t for t in sorted(norm_word_importance) if t not in ('<unk>', '<pad>')]
similarities = [
  [consine_similarity(norm_word_importance[row_type], norm_word_importance[col_type])
   for col_type in types]
  for row_type in types
]

sim_df = pd.DataFrame(similarities, columns=types, index=types)
display(sim_df)

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

# Annotated heatmap of the type-by-type similarity matrix; colour scale capped at 0.1.
fig, ax = plt.subplots(figsize=(16, 16))
sns.heatmap(sim_df, ax=ax, annot=True, cmap="YlGnBu", vmax=0.1,
            linewidths=0.1, linecolor='white')
plt.show()

In [None]:
# Average similarity grouped by the number of letters two type names have in common.
sim_for_common_letters = {}   # shared-letter count -> summed similarity
num_common_letters = {}       # shared-letter count -> number of pairs

for row_type, sims in zip(types, similarities):
  for col_type, sim in zip(types, sims):
    num_common_char = len(set(row_type) & set(col_type))
    sim_for_common_letters[num_common_char] = sim_for_common_letters.get(num_common_char, 0) + sim
    num_common_letters[num_common_char] = num_common_letters.get(num_common_char, 0) + 1

print('Common letters', 'Avg. Similarity', sep='\t')
for common_letter in sorted(sim_for_common_letters, reverse=True):
  # Four shared letters means a type compared with itself; not reported.
  if common_letter != 4:
    average = sim_for_common_letters[common_letter] / num_common_letters[common_letter]
    print('{}\t{}'.format(common_letter, average))