# Get weights from last layer of attention

In [1]:
import torch
import pandas as pd
import pickle as pkl
from module.evaluate import load_dev_labels, get_metrics
import numpy as np

import torch
from model.highlight_ha import HierarchicalAttPredictorHL
from config.basic_config import configs as config
from torchmoji.global_variables import PRETRAINED_PATH, VOCAB_PATH
import random
from module.preprocessor import EnglishPreProcessor
from pytorch_pretrained_bert.tokenization import BertTokenizer
import json
from torchmoji.sentence_tokenizer import SentenceTokenizer
from torch.utils.data import Dataset, DataLoader
from module import create_data

Reading twitter - 1grams ...
Reading twitter - 2grams ...
Reading twitter - 1grams ...


## Load parameters as defined in confit

In [2]:


EMAI_PAD_LEN = config['train']['EMAI_PAD_LEN']
EMOJ_SENT_PAD_LEN = config['train']['EMOJ_SENT_PAD_LEN']
SENT_PAD_LEN = config['train']['SENT_PAD_LEN']
SENT_EMB_DIM = config['model']['SENT_EMB_DIM']
learning_rate = config['train']['learning_rate']
FILL_VOCAB = config['train']['FILL_VOCAB']
BATCH_SIZE = config['train']['BATCH_SIZE']

SENT_HIDDEN_SIZE = config['model']['SENT_HIDDEN_SIZE']
CTX_LSTM_DIM = config['model']['CTX_LSTM_DIM']

CLIP = config['train']['CLIP']
EARLY_STOP_PATIENCE = config['train']['EARLY_STOP_PATIENCE']
LAMBDA1 = config['train']['LAMBDA1']
LAMBDA2 = config['train']['LAMBDA2']
FLAT = config['train']['FLAT']
GAMMA = config['train']['GAMMA']
# fix random seeds to ensure replicability
RANDOM_SEED = config['train']['RANDOM_SEED']
NUM_OF_VOCAB = config['train']['NUM_OF_VOCAB']

GLOVE_EMB_PATH = config['emb']['glove_path']
bert_vocab_path = config['emb']['bert_vocab_path']

torch.manual_seed(RANDOM_SEED)
torch.cuda.manual_seed(RANDOM_SEED)
torch.cuda.manual_seed_all(RANDOM_SEED)
np.random.seed(RANDOM_SEED)
random.seed(RANDOM_SEED)


In [3]:
preprocessor = EnglishPreProcessor()
tokenizer = BertTokenizer(vocab_file=bert_vocab_path, do_lower_case=True)

print('Tokenizing using dictionary from {}'.format(VOCAB_PATH))
with open(VOCAB_PATH, 'r') as f:
    vocabulary = json.load(f)
emoji_st = SentenceTokenizer(vocabulary, EMOJ_SENT_PAD_LEN)

word2id_path = config['infer']['word2id']
id2word_path = config['infer']['id2word']

with open(word2id_path, 'rb') as w:
    word2id = pkl.load(w)
with open(id2word_path, 'rb') as i:
    id2word = pkl.load(i)
num_of_vocab = len(word2id)

emb = create_data.build_embedding(id2word, GLOVE_EMB_PATH, num_of_vocab)

Tokenizing using dictionary from /data/torchMoji/model/vocabulary.json
loading pkl file
loading finished


100%|██████████| 20003/20003 [00:00<00:00, 377398.31it/s]

loading glove
15614 of 20003 found coverage 0.7805829125631155





## Build a Attention model without last layer
### and load weights from the fully trained model

In [4]:
model = HierarchicalAttPredictorHL(SENT_EMB_DIM, SENT_HIDDEN_SIZE, CTX_LSTM_DIM, num_of_vocab, SENT_PAD_LEN , id2word, USE_ELMO=True, ADD_LINEAR=False) 
model.load_embedding(emb)
model.deepmoji_model.load_specific_weights(PRETRAINED_PATH, exclude_names=['output_layer'])  
full_model = torch.load('/data/SuperMod/hapy_state_wiki_enr_imdb_4.pth')
# model.load_state_dict(full_model.state_dict)  
model.cuda()
model.eval()

Loading weights for embed.weight
Loading weights for lstm_0.weight_ih_l0
Loading weights for lstm_0.weight_hh_l0
Loading weights for lstm_0.bias_ih_l0
Loading weights for lstm_0.bias_hh_l0
Loading weights for lstm_0.weight_ih_l0_reverse
Loading weights for lstm_0.weight_hh_l0_reverse
Loading weights for lstm_0.bias_ih_l0_reverse
Loading weights for lstm_0.bias_hh_l0_reverse
Loading weights for lstm_1.weight_ih_l0
Loading weights for lstm_1.weight_hh_l0
Loading weights for lstm_1.bias_ih_l0
Loading weights for lstm_1.bias_hh_l0
Loading weights for lstm_1.weight_ih_l0_reverse
Loading weights for lstm_1.weight_hh_l0_reverse
Loading weights for lstm_1.bias_ih_l0_reverse
Loading weights for lstm_1.bias_hh_l0_reverse
Loading weights for attention_layer.attention_vector
Ignoring weights for output_layer.0.weight
Ignoring weights for output_layer.0.bias


HierarchicalAttPredictorHL(
  (deepmoji_model): TorchMoji(
    (embed): Embedding(50000, 256)
    (embed_dropout): Dropout2d(p=0.2)
    (lstm_0): LSTMHardSigmoid(256, 512, batch_first=True, bidirectional=True)
    (lstm_1): LSTMHardSigmoid(1024, 512, batch_first=True, bidirectional=True)
    (attention_layer): AttentionOneParaPerChan(2304)
    (final_dropout): Dropout2d(p=0.2)
  )
  (deepmoji2linear): Linear(in_features=2304, out_features=300, bias=True)
  (a_lstm): LSTM(1324, 1000, num_layers=2, batch_first=True, dropout=0.2, bidirectional=True)
  (a_self_attention): AttentionOneParaPerChan(2000)
  (context_lstm): LSTM(4304, 100, num_layers=2, batch_first=True, dropout=0.2, bidirectional=True)
  (ctx_self_attention): AttentionOneParaPerChan(200)
  (embeddings): Embedding(20003, 300, padding_idx=0)
  (context_to_emo): Linear(in_features=100, out_features=2, bias=True)
  (drop_out): Dropout(p=0.2)
  (out2label): Linear(in_features=200, out_features=1, bias=True)
)

## get final_test to try out how it works

In [6]:
final_test_file = '/data/SuperMod/final_test.csv'
final_test_data_list = create_data.load_data_context(data_path=final_test_file, is_train=False)

final_test_data_set = create_data.TestDataSet(final_test_data_list, EMAI_PAD_LEN, SENT_PAD_LEN, word2id, id2word, emoji_st, use_unk=False)
final_test_data_loader = DataLoader(final_test_data_set, batch_size= BATCH_SIZE, shuffle=False)


In [18]:
final_test_data_list[2]

['all , this was my idea .',
 'i have been suggesting this idea for three months , but no one was listening .',
 'what tina suggested was far from realistic .',
 'even if we were able to implement her idea , it would be a waste of time']

In [51]:
final_pred_list_test = []
final_pred_weights = []
sents = []
model.cuda()
for i, (a, a_len, emoji_a) in enumerate(final_test_data_loader):
        

        with torch.no_grad():

            out, weight = model(a.cuda(), a_len, emoji_a.cuda())
            
            sents.append(emoji_a.cpu().numpy())
            
                   
            final_pred_weights.append(weight.cpu().numpy())

            final_pred_list_test.append(out.cpu().numpy())

In [40]:
sum(final_pred_weights[0][9])

tensor(1.0000)

In [95]:
# def print_sent_and_weights (ind_of_sample):
    
#     word_list = final_test_data_list[ind_of_sample]
#     weight_list = final_pred_weights[0][ind_of_sample]
    
#     if len(word_list) > 10:
#         word_list = word_list[:10]
#     elif len(word_list) < 10:
#         weight_list = weight_list[:len(word_list)]
#         sum_new_weights = sum(weight_list)
#         weight_list = weight_list/sum(weight_list)
        
        
#     for words, score in zip(word_list, weight_list):
        
#         print(score, words)
    

In [97]:
def print_sent_and_weights (ind_of_sample):
    
    word_list = final_test_data_list[ind_of_sample]
    weight_list = final_pred_weights[0][ind_of_sample]
    
    if len(word_list) > 10:
        word_list = word_list[:10]
    elif len(word_list) < 10:
        weight_list = weight_list[:len(word_list)]
        sum_new_weights = sum(weight_list)
        weight_list = weight_list/sum(weight_list)
        tox_ind = np.argsort(weight_list)
        
    for words, score in zip(word_list, tox_ind):
        
        print(score, words)

In [66]:
print_sent_and_weights(2)

0 all , this was my idea .
1 i have been suggesting this idea for three months , but no one was listening .
2 what tina suggested was far from realistic .
3 even if we were able to implement her idea , it would be a waste of time


In [67]:
print_sent_and_weights(28)

0 even though what you have said is meaningless , i will do what you say


In [98]:
print_sent_and_weights(19)

0 feel free to pass this along to hr .
1 keep them in the loop for all i care .
2 may as well do it early .
4 you are a misogynist and a terrible human .
3 i will keep hounding you until one of us is fired .


In [60]:
print_sent_and_weights(26)

0.4999581 let ’ s not involve jim in this .
0.5000419 he tend to talk a lot during meetings and make them unnecessarily long .


In [75]:
print_sent_and_weights(45)

0.14253521 someone please explain to her , who was recently called racist by those in her own party , that there is nothing wrong with bringing out the very obvious fact that he has done a very poor job for his district and the city .
0.14285633 just take a look , the facts speak far louder than words !
0.14296398 the dem always play the race card , when in fact they have done so little for our nation ’ s great people .
0.14309411 now , lowest unemployment in u .
0.14286453 s .
0.14271094 history , and only getting better .
0.14297494 he has failed badly !


In [42]:
final_pred_weights[0][35]

tensor([0.1006, 0.1007, 0.1007, 0.1005, 0.1003, 0.1002, 0.1001, 0.0998, 0.0992,
        0.0979])

In [None]:
final_pred_weights[0][28]

In [45]:
final_pred_weights[0][3]

tensor([0.0997, 0.1000, 0.1002, 0.1006, 0.1006, 0.1008, 0.1007, 0.1002, 0.0994,
        0.0978])

In [43]:
final_test_data_list[2]

['all , this was my idea .',
 'i have been suggesting this idea for three months , but no one was listening .',
 'what tina suggested was far from realistic .',
 'even if we were able to implement her idea , it would be a waste of time']

In [44]:
final_test_data_list[35]

["it ' s all their fault we failed .",
 'no idea why anyone would hire pathetic losers like them .']

In [46]:
final_test_data_list[3]

['all , i understand we are deciding on an idea .',
 'i recommend going with option a .',
 'as i mentioned in our meeting , this option is realistic and easy to implement .',
 'we can get it up and running quickly .',
 'tina suggested option b .',
 'that was a viable option as well .',
 'option b would work better if we are not under such tight time pressure .']

## get a highlight test to see how well it works

In [312]:
highlight_test_file = '/data/SuperMod/highlight_test.csv'
highlight_test_data_list = create_data.load_data_context(data_path=highlight_test_file, is_train=False)

highlight_test_data_set = create_data.TestDataSet(highlight_test_data_list, EMAI_PAD_LEN, SENT_PAD_LEN, word2id, id2word, emoji_st, use_unk=False)
highlight_test_data_loader = DataLoader(highlight_test_data_set, batch_size= BATCH_SIZE, shuffle=False)


In [313]:
highlight_pred_list_test = []
highlight_pred_weights = []

model.cuda()
for i, (a, a_len, emoji_a) in enumerate(highlight_test_data_loader):
        

        with torch.no_grad():

            out, weight = model(a.cuda(), a_len, emoji_a.cuda())            
                   
            highlight_pred_weights.append(weight.cpu().numpy())

            highlight_pred_list_test.append(out.cpu().numpy())

In [314]:
def print_sent_and_weights_hl (ind_of_sample):
    
    word_list = highlight_test_data_list[ind_of_sample]
    weight_list = highlight_pred_weights[0][ind_of_sample]
    
    if len(word_list) > 10:
        word_list = word_list[:10]
    elif len(word_list) < 10:
        weight_list = weight_list[:len(word_list)]
        sum_new_weights = sum(weight_list)
        weight_list = weight_list/sum(weight_list)
    
    weight_list = np.asarray(weight_list)
    word_list =  np.asarray(word_list)
    
    tox_ind = np.argsort(weight_list)

    rank = [np.where(tox_ind ==  i)[0][0] for i in range(len(tox_ind))]
    
    print(weight_list)
    print(tox_ind)
    print(rank)

        

    for i, (words,  score, r) in enumerate(zip(word_list, weight_list, rank )):

        print(r, round(score,4), words)


In [315]:
print_sent_and_weights_hl(0)

[0.19918515 0.20000385 0.2001275  0.20010762 0.20057593]
[0 1 3 2 4]
[0, 1, 3, 2, 4]
0 0.1992 what tina suggested was far from realistic .
1 0.2 it would be a waste of time .
3 0.2001 i have been suggesting this idea for three months .
2 0.2001 no one was listening .
4 0.2006 this was my idea .


In [316]:
print_sent_and_weights_hl(1)

[0.19842854 0.19907728 0.19987172 0.2006671  0.20195535]
[0 1 2 3 4]
[0, 1, 2, 3, 4]
0 0.1984 this was my idea .
1 0.1991 i have been suggesting this idea for three months .
2 0.1999 no one was listening .
3 0.2007 what tina suggested was far from realistic .
4 0.202 it would be a waste of time .


In [317]:
print_sent_and_weights_hl(2)

[0.49925917 0.5007408 ]
[0 1]
[0, 1]
0 0.4993 i will do what you say .
1 0.5007 even though what you have said is meaningless .


In [318]:
print_sent_and_weights_hl(3)

[0.5000361  0.49996385]
[1 0]
[1, 0]
1 0.5 even though what you have said is meaningless .
0 0.5 i will do what you say


In [319]:
print_sent_and_weights_hl(4)

[0.49869844 0.5013015 ]
[0 1]
[0, 1]
0 0.4987 i have work to do .
1 0.5013 stop bothering me .


In [320]:
print_sent_and_weights_hl(5)

[0.4996464 0.5003536]
[0 1]
[0, 1]
0 0.4996 stop bothering me .
1 0.5004 i have work to do .


In [321]:
print_sent_and_weights_hl(6)

[0.49902079 0.5009792 ]
[0 1]
[0, 1]
0 0.499 i have work to do .
1 0.501 stop wasting my time with worthless ideas .


In [322]:
print_sent_and_weights_hl(7)

[0.4996745 0.5003255]
[0 1]
[0, 1]
0 0.4997 stop wasting my time with worthless ideas .
1 0.5003 i have work to do .


In [323]:
print_sent_and_weights_hl(8)

[0.25004327 0.24952266 0.249835   0.25059903]
[1 2 0 3]
[2, 0, 1, 3]
2 0.25 this is completely idiotic .
0 0.2495 the last idea is great .
1 0.2498 this is never going to work .
3 0.2506 i will think of another one .


In [324]:
print_sent_and_weights_hl(9)

[0.24978505 0.2500586  0.24971727 0.25043914]
[2 0 1 3]
[1, 2, 0, 3]
1 0.2498 i will think of another one .
2 0.2501 this is completely idiotic .
0 0.2497 the last idea is great .
3 0.2504 this is never going to work .


## Observations:
1. Later sentences tend to receive higher score
2. It makes sense because later sentences are built on prior ones
3. And earlier sentences can change meaning of later sentences. E.g. I have work to do sounds fine as first sentence, but after saying stop wasting my time, I have work to do becomes toxic. 

This shows that even though we kind of flags the emotion in our sentences, this is not always accurate in finding out the 'toxic sentence' - probably the downside of capturing context.

## Convert to .py and test run

In [327]:
!python get_highlight_ha.py -test_path '/data/SuperMod/highlight_test.csv'  -out_path '/data/SuperMod/hapy_state_wiki_enr_imdb_4.pth'

Reading twitter - 1grams ...
Reading twitter - 2grams ...
Reading twitter - 1grams ...
[0m