In [5]:
import pandas as pd 
import os 
import argparse
import json
import torch
from torch import nn
from transformers import BertForSequenceClassification
from attrdict import AttrDict
from transformers import BertConfig, BertTokenizer, BertModel

In [6]:
default_path = os.getcwd()
data_path = os.path.join(default_path, '../data')
base_model = os.path.join(default_path, '../base-model')
model_path = os.path.join(default_path, '../models')
config_path = os.path.join(default_path, '../config')
log_path = os.path.join(default_path, '../log')
config_file = "bert-base.json"

In [7]:
dsm_samp = pd.read_csv(os.path.join(data_path, 'dsm_samp_test.csv'))
dsm_samp.head(3)

Unnamed: 0,id,text,label
0,50gph3,every little insult even if it's online just h...,8
1,t3_wfhxs,"do you know why you're feeling depressed, or i...",0
2,58580,So I'm just gonna live in the countryside,9


In [5]:
tokenizer = BertTokenizer.from_pretrained(os.path.join(base_model, 'bert-mini'), model_max_length=128)
config = BertConfig.from_pretrained(os.path.join(base_model, 'bert-mini', 'bert_config.json'),\
                                    num_labels=10,\
                                    output_hidden_states=True,\
                                    output_attentions=True)
model = BertForSequenceClassification.from_pretrained(os.path.join(base_model, 'bert-mini'), config=config)

Some weights of the model checkpoint at F:\AuD\base-model\bert-mini were not used when initializing BertForSequenceClassification: ['cls.predictions.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.decoder.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.decoder.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were no

In [6]:
with open(os.path.join(config_path, 'training_config.json')) as f:
    training_config = AttrDict(json.load(f))

training_config.pad = 'max_length'
training_config.device = torch.device("cuda") if torch.cuda.is_available() else "cpu"

In [7]:
config.max_position_embeddings = 128

In [8]:
model_name = os.path.join(model_path, 'DSM-5.pt')

In [9]:
model.load_state_dict(torch.load(model_name, map_location=torch.device('cpu')))
model.to(training_config.device)

BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 256, padding_idx=0)
      (position_embeddings): Embedding(512, 256)
      (token_type_embeddings): Embedding(2, 256)
      (LayerNorm): LayerNorm((256,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0): BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=256, out_features=256, bias=True)
              (key): Linear(in_features=256, out_features=256, bias=True)
              (value): Linear(in_features=256, out_features=256, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=256, out_features=256, bias=True)
              (LayerNorm): LayerNorm((256,), eps=1e-12, element

In [13]:
input_text = "I am so depressed today"
inputs = tokenizer.encode(input_text, return_tensors='pt').to(training_config.device)

In [14]:
inputs, len(inputs[0])

(tensor([[  101,  1045,  2572,  2061, 14777,  2651,   102]]), 7)

In [15]:
outputs = model(inputs)  # Run model
attention = outputs[-1]  # Retrieve attention from model outputs
tokens = tokenizer.convert_ids_to_tokens(inputs[0])  # Convert input ids to token strings

In [16]:
outputs

SequenceClassifierOutput(loss=None, logits=tensor([[ 9.0596, -0.5454, -0.8855, -1.9694, -2.4331, -2.7073, -2.1461, -1.6226,
         -2.0392, -0.5773]], grad_fn=<AddmmBackward0>), hidden_states=(tensor([[[ 0.3584, -0.5670,  1.5455,  ..., -0.0145,  0.0475,  0.7534],
         [ 1.0136, -0.4370,  0.4381,  ..., -0.8308, -1.4829,  0.4043],
         [ 1.8820,  0.9794,  0.1584,  ...,  0.3295, -1.5544,  0.7176],
         ...,
         [-0.1143, -0.7724, -1.0670,  ...,  2.0412, -0.8892,  1.0714],
         [ 3.1112, -0.9919, -1.0718,  ..., -0.1175, -0.4586,  1.0868],
         [-0.5815, -0.7431,  0.7391,  ..., -0.9457, -0.1700,  0.6018]]],
       grad_fn=<NativeLayerNormBackward0>), tensor([[[ 0.7993, -0.8706,  0.7480,  ..., -0.0308, -0.3917,  1.3282],
         [ 1.4623, -0.0041, -0.3623,  ..., -0.2483, -0.7309,  1.7501],
         [ 0.9837,  0.0609, -0.0236,  ...,  1.0084, -1.6034,  1.8914],
         ...,
         [-0.3389, -0.4222, -0.7288,  ...,  2.4275, -1.2857,  2.2013],
         [ 2.1268, -1

In [17]:
len(outputs.attentions), outputs.attentions[0].size()

(4, torch.Size([1, 4, 7, 7]))

In [18]:
len(outputs.hidden_states), len(outputs.hidden_states[0]), len(outputs.hidden_states[0][0]), len(outputs.hidden_states[0][0][0])

(5, 1, 7, 256)

In [19]:
len(outputs.attentions), len(outputs.attentions[-1]), len(outputs.attentions[-1][0]), len(outputs.attentions[-1][0][0]), len(outputs.attentions[-1][0][0][0])

(4, 1, 4, 7, 7)

In [20]:
torch.squeeze(outputs.attentions[-1], dim=1).size()

torch.Size([1, 4, 7, 7])

In [21]:
outputs.keys()

odict_keys(['logits', 'hidden_states', 'attentions'])

In [28]:
outputs.attentions[-1][0][0][6], sum(outputs.attentions[-1][0][0][6])

(tensor([0.1116, 0.0753, 0.1094, 0.0953, 0.1468, 0.1031, 0.3585],
        grad_fn=<SelectBackward0>),
 tensor(1.0000, grad_fn=<AddBackward0>))

In [29]:
outputs.attentions[-1][0]

tensor([[[0.1173, 0.0451, 0.0841, 0.0784, 0.1292, 0.0754, 0.4705],
         [0.0791, 0.0389, 0.0519, 0.0610, 0.0959, 0.0663, 0.6068],
         [0.0862, 0.0375, 0.0584, 0.0652, 0.1024, 0.0662, 0.5842],
         [0.0942, 0.0395, 0.0627, 0.0753, 0.1168, 0.0733, 0.5381],
         [0.1030, 0.0406, 0.0628, 0.0742, 0.1175, 0.0744, 0.5274],
         [0.0790, 0.0364, 0.0593, 0.0660, 0.1088, 0.0760, 0.5743],
         [0.1116, 0.0753, 0.1094, 0.0953, 0.1468, 0.1031, 0.3585]],

        [[0.1637, 0.1049, 0.1804, 0.1774, 0.1790, 0.0292, 0.1653],
         [0.1365, 0.0950, 0.1679, 0.1865, 0.1581, 0.0265, 0.2295],
         [0.1386, 0.0976, 0.1629, 0.1874, 0.1600, 0.0252, 0.2283],
         [0.1465, 0.0937, 0.1688, 0.1907, 0.1545, 0.0240, 0.2217],
         [0.1432, 0.0973, 0.1549, 0.1765, 0.1567, 0.0307, 0.2408],
         [0.1260, 0.0926, 0.1757, 0.1944, 0.1532, 0.0320, 0.2261],
         [0.1308, 0.1295, 0.2042, 0.2050, 0.1920, 0.0294, 0.1091]],

        [[0.1746, 0.0899, 0.0876, 0.1535, 0.1599, 0.0557, 

In [30]:
outputs.attentions[-1][0].size(), outputs.attentions[-1][0][0].size() 

(torch.Size([4, 7, 7]), torch.Size([7, 7]))

In [31]:
outputs.attentions[-1][0][0][6].sort(descending=True)

torch.return_types.sort(
values=tensor([0.3585, 0.1468, 0.1116, 0.1094, 0.1031, 0.0953, 0.0753],
       grad_fn=<SortBackward0>),
indices=tensor([6, 4, 0, 2, 5, 3, 1]))

In [33]:
att_metrics = outputs.attentions[-1][0]
att_sum = list(map(sum, att_metrics))

In [34]:
sum(att_sum).sort(descending=True).indices

tensor([6, 4, 0, 3, 2, 1, 5])

In [35]:
sum(att_sum).sort(descending=True)

torch.return_types.sort(
values=tensor([9.1481, 5.0017, 4.1557, 3.4887, 2.8418, 1.9306, 1.4333],
       grad_fn=<SortBackward0>),
indices=tensor([6, 4, 0, 3, 2, 1, 5]))

In [39]:
print(inputs[0][3], inputs[0][1], inputs[0][2])
tokenizer.decode(2061), tokenizer.decode(1045), tokenizer.decode(2572), tokenizer.decode(1010), tokenizer.decode(1998)

tensor(2061) tensor(1045) tensor(2572)


('s o', 'i', 'a m', ',', 'a n d')

In [41]:
def get_att_toks(input_text, num_words):
    input_text = input_text.replace("'m", " am").replace('.', ' ').replace(',', ' ')
    print(input_text)
    inputs = tokenizer.encode(input_text, return_tensors='pt').to(training_config.device)
    outputs = model(inputs)  # Run model
    attention = outputs[-1]  # Retrieve attention from model outputs
    tokens = tokenizer.convert_ids_to_tokens(inputs[0])  # Convert input ids to token strings
    att_metrics = outputs.attentions[-1][0]
    # print(f'att_metric: {len(att_metrics[-1])}')
    att_sum = list(map(sum, att_metrics))
    # print(f'att_sum: {len(att_sum[0])}')
    sorted_att = sum(att_sum).sort(descending=True)
    
    cnt = 0 
    tok_idx = []
    for idx in range(len(inputs[0])):
        if inputs[0][sorted_att.indices[idx]] == 101 or inputs[0][sorted_att.indices[idx]] == 102:
            continue
        tok_idx.append(sorted_att.indices[idx])
        cnt += 1
        if cnt == num_words:
            break 
    
    tok_list = [tokenizer.decode(inputs[0][int(tok)]) for tok in tok_idx]
    return tok_list

In [42]:
input_text = "Anyway, thanks for listening"
# inputs = tokenizer.encode(input_text, return_tensors='pt').to(training_config.device)
get_att_toks(input_text, 15)

Anyway  thanks for listening


['t h a n k s', 'a n y w a y', 'l i s t e n i n g', 'f o r']

In [43]:
text = 'hey'
conv = pd.DataFrame([text], columns=['text'])
conv['speaker'] = 'user'
conv = conv[['speaker', 'text']]
conv

Unnamed: 0,speaker,text
0,user,hey


In [44]:
conv.loc[1] = ['chatbot', 'hello, nice to meet you']
conv.loc[2] = ['user', 'who are you ?']
conv.loc[3] = ['chatbot', 'I am a psychological counseling chatbot']
conv.loc[4] = ['user', 'ah-huh']
conv.loc[5] = ['chatbot', 'how are you ?']
conv.loc[6] = ['user', 'I am very depressed today']
conv.loc[7] = ['chatbot', 'what is the matter ?'] 
conv.loc[8] = ['user', 'I do not know why but just depressed']
conv.loc[9] = ['chatbot', 'when you are depressed, you have to move']
conv.loc[10] = ['user', 'also I lost 30 pounds and I feel lethargic']
conv.loc[11] = ['chatbot', 'that sounds too bad']
conv.loc[12] = ['user', 'Anyway, thanks for listening']
conv.loc[13] = ['chatbot', 'yes, see you next time']
conv

Unnamed: 0,speaker,text
0,user,hey
1,chatbot,"hello, nice to meet you"
2,user,who are you ?
3,chatbot,I am a psychological counseling chatbot
4,user,ah-huh
5,chatbot,how are you ?
6,user,I am very depressed today
7,chatbot,what is the matter ?
8,user,I do not know why but just depressed
9,chatbot,"when you are depressed, you have to move"


In [45]:
user_idx = [idx for idx in range(len(conv)) if idx % 2 == 0]
user_conv = conv.loc[user_idx]
user_conv.reset_index(inplace=True, drop=True)

In [46]:
tok_list = []
for idx in range(len(user_conv)):
    tok_list.append(get_att_toks(user_conv.text[idx], 3))

tok_list

hey
who are you ?
ah-huh
I am very depressed today
I do not know why but just depressed
also I lost 30 pounds and I feel lethargic
Anyway  thanks for listening


[['h e y'],
 ['?', 'y o u', 'w h o'],
 ['h u h', '-', 'a h'],
 ['d e p r e s s e d', 'a m', 'v e r y'],
 ['d e p r e s s e d', 'b u t', 'w h y'],
 ['# # h a r', 'p o u n d s', '3 0'],
 ['t h a n k s', 'a n y w a y', 'l i s t e n i n g']]

In [47]:
user_conv['tokens'] = tok_list
user_conv.tokens = user_conv.tokens

In [48]:
user_conv.tokens = user_conv.tokens.apply(lambda x: ', '.join(x))
user_conv

Unnamed: 0,speaker,text,tokens
0,user,hey,h e y
1,user,who are you ?,"?, y o u, w h o"
2,user,ah-huh,"h u h, -, a h"
3,user,I am very depressed today,"d e p r e s s e d, a m, v e r y"
4,user,I do not know why but just depressed,"d e p r e s s e d, b u t, w h y"
5,user,also I lost 30 pounds and I feel lethargic,"# # h a r, p o u n d s, 3 0"
6,user,"Anyway, thanks for listening","t h a n k s, a n y w a y, l i s t e n i n g"
