In [2]:
import pandas as pd 
import numpy as np
import random
import os 
import argparse
import json
import torch
import pickle
import time
import matplotlib.pyplot as plt
from torch import nn
from attrdict import AttrDict
from torch.utils.data import Dataset, DataLoader, RandomSampler, SequentialSampler
from torch.utils.data import TensorDataset
from transformers import BertForSequenceClassification
from transformers import BertConfig, BertTokenizer, BertModel

In [11]:
default_path = os.getcwd()
data_path = os.path.join(default_path, '../data')
base_model = os.path.join(default_path, '../base-model')
model_path = os.path.join(default_path, '../models')
config_path = os.path.join(default_path, '../config')
log_path = os.path.join(default_path, '../log')
config_file = "bert-base.json"

In [12]:
label = dict()

label[0] = '우울'
label[1] = '무기력'
label[2] = '급격한 체중(식욕)변화'
label[3] = '수면장애'
label[4] = '정서불안'
label[5] = '피로'
label[6] = '과도한 죄책감 및 무가치함'
label[7] = '인지기능저하'
label[8] = '자살충동'
label[9] = '일상'
label

{0: '우울',
 1: '무기력',
 2: '급격한 체중(식욕)변화',
 3: '수면장애',
 4: '정서불안',
 5: '피로',
 6: '과도한 죄책감 및 무가치함',
 7: '인지기능저하',
 8: '자살충동',
 9: '일상'}

In [13]:
with open(os.path.join(config_path, 'training_config.json')) as f:
    training_config = AttrDict(json.load(f))
    
training_config.pad = 'max_length'
training_config.device = torch.device("cuda") if torch.cuda.is_available() else "cpu"

In [14]:
class BertDataset(Dataset):
    def __init__(self, data_file):
        self.data = data_file
    
    def __len__(self):
        return len(self.data.label)
    
    def reset_index(self):
        self.data.reset_index(inplace=True, drop=True)
    
    def __getitem__(self, idx):
        '''
        return text, label
        '''
        self.reset_index()
        text = self.data.text[idx]
        label = self.data.label[idx]
        return text, label

In [15]:
class BertProcessor():
    def __init__(self, training_config, tokenizer, truncation=True):
        self.tokenizer = tokenizer 
        self.max_len = 128
        self.pad = training_config.pad
        self.batch_size = training_config.train_batch_size
        self.truncation = truncation
    
    def convert_data(self, data_file):
        context2 = None    # single sentence classification

        batch_encoding = self.tokenizer.batch_encode_plus(
            [(data_file[idx][0], context2) for idx in range(len(data_file))],   # text, 
            max_length = self.max_len,
            padding = self.pad,
            truncation = self.truncation
        )
        
        features = []
        for i in range(len(data_file)):
            inputs = {k: batch_encoding[k][i] for k in batch_encoding}
            try:
                inputs['label'] = data_file[i][1] 
            except:
                inputs['label'] = 0 
            features.append(inputs)
        
        all_input_ids = torch.tensor([f['input_ids'] for f in features], dtype=torch.long)
        all_attention_mask = torch.tensor([f['attention_mask'] for f in features], dtype=torch.long)
        all_token_type_ids = torch.tensor([f['token_type_ids'] for f in features], dtype=torch.long)
        all_labels = torch.tensor([f['label'] for f in features], dtype=torch.long)

        dataset = TensorDataset(all_input_ids, all_attention_mask, all_token_type_ids, all_labels)
        return dataset
    
    def convert_sentence(self, sent_list):   # 사용자 입력 문장 1개 -> 입력 형태 변환
        context2 = None 
        batch_encoding = self.tokenizer.batch_encode_plus(
            [(sent_list, context2)], max_length=self.max_len, padding=self.pad, truncation=self.truncation
        )
        
        features = []
        inputs = {k: batch_encoding[k][0] for k in batch_encoding}
        inputs['label'] = 0 
        features.append(inputs)

        input_id = torch.tensor([f['input_ids'] for f in features], dtype=torch.long)
        input_am = torch.tensor([f['attention_mask'] for f in features], dtype=torch.long)
        input_tts = torch.tensor([f['token_type_ids'] for f in features], dtype=torch.long)
        input_lb = torch.tensor([f['label'] for f in features], dtype=torch.long)
        dataset = TensorDataset(input_id, input_am, input_tts, input_lb)
        return dataset
    
    def shuffle_data(self, dataset, data_type):
        if data_type == 'train':
            return RandomSampler(dataset)
        elif data_type == 'eval' or data_type == 'test':
            return SequentialSampler(dataset)
        
    def load_data(self, dataset, sampler):
        return DataLoader(dataset, sampler=sampler, batch_size=self.batch_size)

In [16]:
class BertRegressor(nn.Module):
    def __init__(self, config, model):
        super(BertRegressor, self).__init__()
        self.model = model
        self.linear = nn.Linear(config.hidden_size, 128)
        self.relu = nn.ReLU()
        self.out = nn.Linear(128, 1)
    
    def forward(self, input_ids, attention_mask, token_type_ids):
        outputs = self.model(input_ids=input_ids, attention_mask=attention_mask, token_type_ids=token_type_ids)
        logits = outputs.last_hidden_state[:, 0, :]
        x = self.linear(logits)
        x = self.relu(x)
        score = self.out(x)
        return score 

In [17]:
class BertRegTester():
    def __init__(self, training_config, model):
        self.training_config = training_config
        self.model = model

    def get_label(self, test_dataloader, test_type):
        '''
        test_type: 0  -> Test dataset 
        test_type: 1  -> Test sentence
        '''
        preds = []
        labels = []

        for batch in test_dataloader:
            self.model.eval()   # self 안 붙이면 이상한 Output (BaseModelOutputWithPoolingAndCrossAttentions) 출력 
            batch = tuple(t.to(training_config.device) for t in batch)   # args.device: cuda 
            with torch.no_grad():
                inputs = {
                    "input_ids": batch[0],
                    "attention_mask": batch[1],
                    "token_type_ids": batch[2],
                }
                outputs = self.model(**inputs)
                if test_type == 0:
                    preds.extend(outputs.squeeze().detach().cpu().numpy())
                elif test_type == 1:
                    preds.extend(outputs[0].detach().cpu().numpy())            
            label = batch[3].detach().cpu().numpy()
            labels.extend(label)
        return preds, labels 

In [18]:
class BertClsTester():
    def __init__(self, training_config, model):
        self.training_config = training_config
        self.model = model

    def get_label(self, test_dataloader, test_type):
        '''
        test_type: 0  -> Test dataset 
        test_type: 1  -> Test sentence
        '''
        preds = []
        labels = []

        for batch in test_dataloader:
            self.model.eval()
            batch = tuple(t.to(self.training_config.device) for t in batch)   # args.device: cuda 
            with torch.no_grad():
                inputs = {
                    "input_ids": batch[0],
                    "attention_mask": batch[1],
                    "token_type_ids": batch[2],
                    "labels": batch[3]
                }
                outputs = self.model(**inputs)
                test_loss, logits = outputs[:2] 
                pred = logits.detach().cpu().numpy()
                if test_type == 0:
                    preds.extend(np.argmax(pred, axis=1))
                elif test_type == 1:
                    preds.append(np.argmax(pred))  
            label = inputs["labels"].detach().cpu().numpy()
            labels.extend(label)
        return preds, labels 
    
    def get_f1_score(self, test_dataloader):
        y_pred, y_true = self.get_label(test_dataloader)
        return round(f1_score(y_true, y_pred, average='micro'), 3) 
     
    def get_cl_report(self, test_dataloader):
        y_pred, y_true = self.get_label(test_dataloader, 0)
        cr = classification_report(y_true, y_pred).split('\n')
        clr_df = []

        for idx, line in enumerate(cr):
            clr_df.append([])
            if line == '':
                continue

            word_list = line.strip().split(' ')

            for word in word_list:
                if word != '':
                    clr_df[idx].append(word)

        clr_df[-2][0] = ' '.join([clr_df[-2][0], clr_df[-2][1]])
        clr_df[-3][0] = ' '.join([clr_df[-3][0], clr_df[-3][1]])
        clr_df[-4].insert(1, ' ')
        clr_df[-4].insert(2, ' ')
        clr_df[0].insert(0, 'index')

        clr_df[-2].pop(1)
        clr_df[-3].pop(1)
        clr_df.pop(1)
        clr_df.pop(-1)
        clr_df.pop(-4)
        clr_df = pd.DataFrame(clr_df[1:], columns=clr_df[0])
        clr_df.index = clr_df['index']

        del clr_df['index']
        return clr_df

In [19]:
bws_tokenizer = BertTokenizer.from_pretrained(os.path.join(base_model, 'bert-small'), model_max_length=128)
bws_config = BertConfig.from_pretrained(os.path.join(base_model, 'bert-small', 'bert_config.json'), output_hidden_states=True, output_attentions=True)
bws_model = BertModel.from_pretrained(os.path.join(base_model, 'bert-small'), config=bws_config)

Some weights of the model checkpoint at F:\AuD\jupyter notebook\../base-model\bert-small were not used when initializing BertModel: ['cls.predictions.transform.dense.bias', 'cls.predictions.decoder.bias', 'cls.predictions.transform.dense.weight', 'cls.seq_relationship.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.decoder.weight', 'cls.predictions.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [11]:
bws_config.max_position_embeddings = 128

In [12]:
bws_model.to(training_config.device)

BertModel(
  (embeddings): BertEmbeddings(
    (word_embeddings): Embedding(30522, 512, padding_idx=0)
    (position_embeddings): Embedding(512, 512)
    (token_type_embeddings): Embedding(2, 512)
    (LayerNorm): LayerNorm((512,), eps=1e-12, elementwise_affine=True)
    (dropout): Dropout(p=0.1, inplace=False)
  )
  (encoder): BertEncoder(
    (layer): ModuleList(
      (0): BertLayer(
        (attention): BertAttention(
          (self): BertSelfAttention(
            (query): Linear(in_features=512, out_features=512, bias=True)
            (key): Linear(in_features=512, out_features=512, bias=True)
            (value): Linear(in_features=512, out_features=512, bias=True)
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (output): BertSelfOutput(
            (dense): Linear(in_features=512, out_features=512, bias=True)
            (LayerNorm): LayerNorm((512,), eps=1e-12, elementwise_affine=True)
            (dropout): Dropout(p=0.1, inplace=False)
          

In [13]:
bws_processor = BertProcessor(training_config, bws_tokenizer)

In [14]:
bws_reg = BertRegressor(bws_config, bws_model)

In [15]:
bws_model_name = os.path.join(model_path, 'BWS.pt')

In [16]:
# model_reg.load_state_dict(torch.load(model_name))
bws_reg.load_state_dict(torch.load(bws_model_name, map_location=torch.device('cpu')))
bws_reg.to(training_config.device)

BertRegressor(
  (model): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 512, padding_idx=0)
      (position_embeddings): Embedding(512, 512)
      (token_type_embeddings): Embedding(2, 512)
      (LayerNorm): LayerNorm((512,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0): BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=512, out_features=512, bias=True)
              (key): Linear(in_features=512, out_features=512, bias=True)
              (value): Linear(in_features=512, out_features=512, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=512, out_features=512, bias=True)
              (LayerNorm): LayerNorm((512,), eps=1e-12, elementwise_affine=Tru

In [17]:
bws_tester = BertRegTester(training_config, bws_reg)

In [18]:
dsm_tokenizer = BertTokenizer.from_pretrained(os.path.join(base_model, 'bert-mini'), model_max_length=128)
dsm_config = BertConfig.from_pretrained(os.path.join(base_model, 'bert-mini', 'bert_config.json'), num_labels=10, output_hidden_states=True, output_attentions=True)
dsm_model = BertForSequenceClassification.from_pretrained(os.path.join(base_model, 'bert-mini'), config=dsm_config)

Some weights of the model checkpoint at F:\AuD\base-model\bert-mini were not used when initializing BertForSequenceClassification: ['cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.decoder.bias', 'cls.predictions.bias', 'cls.seq_relationship.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were no

In [19]:
dsm_config.max_position_embeddings = 128

In [20]:
dsm_model.to(training_config.device)

BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 256, padding_idx=0)
      (position_embeddings): Embedding(512, 256)
      (token_type_embeddings): Embedding(2, 256)
      (LayerNorm): LayerNorm((256,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0): BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=256, out_features=256, bias=True)
              (key): Linear(in_features=256, out_features=256, bias=True)
              (value): Linear(in_features=256, out_features=256, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=256, out_features=256, bias=True)
              (LayerNorm): LayerNorm((256,), eps=1e-12, element

In [21]:
dsm_processor = BertProcessor(training_config, dsm_tokenizer)

In [22]:
dsm_model_name = os.path.join(model_path, 'DSM-5.pt')

In [23]:
dsm_model.load_state_dict(torch.load(dsm_model_name, map_location=torch.device('cpu')))
dsm_model.to(training_config.device)

BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 256, padding_idx=0)
      (position_embeddings): Embedding(512, 256)
      (token_type_embeddings): Embedding(2, 256)
      (LayerNorm): LayerNorm((256,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0): BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=256, out_features=256, bias=True)
              (key): Linear(in_features=256, out_features=256, bias=True)
              (value): Linear(in_features=256, out_features=256, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=256, out_features=256, bias=True)
              (LayerNorm): LayerNorm((256,), eps=1e-12, element

In [24]:
dsm_tester = BertClsTester(training_config, dsm_model)

In [25]:
def get_att_toks(input_text, tokenizer, model, num_words):
    input_text = input_text.replace("'m", " am").replace('.', ' ').replace(',', ' ')
    print(input_text)
    inputs = tokenizer.encode(input_text, return_tensors='pt').to(training_config.device)
    outputs = model(inputs)  # Run model
    attention = outputs[-1]  # Retrieve attention from model outputs
    tokens = tokenizer.convert_ids_to_tokens(inputs[0])  # Convert input ids to token strings
    att_metrics = outputs.attentions[-1][0]
    att_sum = list(map(sum, att_metrics))
    sorted_att = sum(att_sum).sort(descending=True)
    
    cnt = 0 
    tok_idx = []
    for idx in range(len(inputs[0])):
        if inputs[0][sorted_att.indices[idx]] == 101 or inputs[0][sorted_att.indices[idx]] == 102:
            continue
        tok_idx.append(sorted_att.indices[idx])
        cnt += 1
        if cnt == num_words:
            break 
    
    tok_list = [tokenizer.decode(inputs[0][int(tok)]) for tok in tok_idx]
    return tok_list

In [26]:
# input_text = "also I lost 30 pounds and I feel lethargic"
input_text = "I do not feel depressed today"

In [27]:
bws_data = bws_processor.convert_sentence(input_text)
bws_sampler = bws_processor.shuffle_data(bws_data, 'test')
bws_loader = bws_processor.load_data(bws_data, bws_sampler)

In [28]:
dsm_data = dsm_processor.convert_sentence(input_text)
dsm_sampler = dsm_processor.shuffle_data(dsm_data, 'test')
dsm_loader = dsm_processor.load_data(dsm_data, dsm_sampler)

In [29]:
bws_pred, _ = bws_tester.get_label(bws_loader, 1)
dsm_pred, _ = dsm_tester.get_label(dsm_loader, 1)

In [30]:
get_att_toks(input_text, bws_tokenizer, bws_model, 3)
# get_att_toks(input_text, dsm_tokenizer, dsm_model, 3)

I do not feel depressed today


['f e e l', 'd e p r e s s e d', 'd o']

In [31]:
bws_pred[0], label[dsm_pred[0]]

(8.110906, '우울')

#### CBT 

In [62]:
cbt = pd.read_csv(os.path.join(data_path, 'cbt.csv'), skiprows=8)

In [57]:
cbt_samp = cbt[['cbt', 'question', 'response (오재동)']]
cbt_samp.columns = ['cbt_no', 'chatbot', 'user']
cbt_samp.head(1)

Unnamed: 0,cbt_no,chatbot,user
0,CBT 1-1,안녕! 마음의 방에 온걸 환영해.,안녕


In [58]:
cbt_samp.user.fillna('공백', inplace=True)
cbt_samp

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  return super().fillna(


Unnamed: 0,cbt_no,chatbot,user
0,CBT 1-1,안녕! 마음의 방에 온걸 환영해.,안녕
1,,사람은 여러 가지의 감정을 느껴. 좋은 기분도 싫은 기분도 느끼게 되는데 그것은 자...,자신의 감정을 받아드리는게 중요하구나
2,,"울적한 기분이 들었을 때 그 후에 어떻게 행동할까, 어떻게 생각할까 하는 것은 별개...",그걸 알면서도 가끔은 감정이 행동으로 나도 모르게 이어지는거 같아
3,,"최근에 있었던 기분 좋았던 일이나 기분 나빴던 상황을 떠올려 볼래? 예를 들어, 학...",논문 쓰는걸 드디어 끝냈어 기분 좋아
4,,방금 연습해 본 것처럼 앞으로 일상에서 어떤 생각을 하고 어떤 감정이 들고 어떤 행...,좋아 앞으로 잘 부탁해
...,...,...,...
192,,마음의 방에 와서 새롭게 배운 것들은 뭐가 있어?,내 마음을 좀 더 돌아볼 수 있었던거 같아
193,,마음의 방에서 배운 것들 중에서 너에게 가장 도움이 되었던 건 뭐야?,부정적인 감정에 좀 더 잘 대처할 수 있을거 같아
194,,마음의 방에 온 이후에 생긴 삶의 변화는 어떤 것들이 있어?,큰 변화는 없어
195,,여기 오면서 예전에는 하지 못했거나 오랫동안 하지 않았던 것 중에서 하게 된 것도 있어?,딱히 없는거 같아


In [59]:
cbt_samp.to_csv(os.path.join(data_path, 'cbt_samp.csv'), index=False)

In [63]:
cbt_t1 = pd.read_csv(os.path.join(data_path, 'cbt_samp_t1.csv'))
cbt_t2 = pd.read_csv(os.path.join(data_path, 'cbt_samp_t2.csv'))

In [66]:
cbt_t1

Unnamed: 0,cbt_no,chatbot,user,translated
0,CBT 1-1,안녕! 마음의 방에 온걸 환영해.,안녕,Hello! Welcome to the mind room.
1,,사람은 여러 가지의 감정을 느껴. 좋은 기분도 싫은 기분도 느끼게 되는데 그것은 자...,자신의 감정을 받아드리는게 중요하구나,People feel various emotions. It makes me feel...
2,,"울적한 기분이 들었을 때 그 후에 어떻게 행동할까, 어떻게 생각할까 하는 것은 별개...",그걸 알면서도 가끔은 감정이 행동으로 나도 모르게 이어지는거 같아,It's another matter of how to act and how to t...
3,,"최근에 있었던 기분 좋았던 일이나 기분 나빴던 상황을 떠올려 볼래? 예를 들어, 학...",논문 쓰는걸 드디어 끝냈어 기분 좋아,Can you think of something that made you feel ...
4,,방금 연습해 본 것처럼 앞으로 일상에서 어떤 생각을 하고 어떤 감정이 들고 어떤 행...,좋아 앞으로 잘 부탁해,"As we just practiced, we will find out what th..."
...,...,...,...,...
192,,마음의 방에 와서 새롭게 배운 것들은 뭐가 있어?,내 마음을 좀 더 돌아볼 수 있었던거 같아,What are some new things you learned in the mi...
193,,마음의 방에서 배운 것들 중에서 너에게 가장 도움이 되었던 건 뭐야?,부정적인 감정에 좀 더 잘 대처할 수 있을거 같아,What was the most helpful thing you learned in...
194,,마음의 방에 온 이후에 생긴 삶의 변화는 어떤 것들이 있어?,큰 변화는 없어,What are some changes in your life after comin...
195,,여기 오면서 예전에는 하지 못했거나 오랫동안 하지 않았던 것 중에서 하게 된 것도 있어?,딱히 없는거 같아,Is there anything you haven't done before or h...


In [68]:
cbt_samp['chatbot_kor'] = cbt_t1.translated
cbt_samp['user_kor'] = cbt_t2.translated

cbt_samp.head(1)

Unnamed: 0,cbt_no,chatbot,user,chatbot_kor,user_kor
0,CBT 1-1,안녕! 마음의 방에 온걸 환영해.,안녕,Hello! Welcome to the mind room.,Hi.
