### 0. Environment Settings 

#### 1) Import Library

In [47]:
import pandas as pd 
import numpy as np
import pymysql
import os 
import torch
import time
import math
import torch.nn as nn 
import torch.nn.functional as F

#### 2) MySQL Connect 

In [3]:
conn = pymysql.connect(host='localhost', user='lamda_00', password='lamda95', db='chatbot', charset='utf8')
curs = conn.cursor()

In [4]:
def execute_sql(sql):
    curs.execute(sql)
    
    return curs.fetchall()

In [5]:
sql = 'SHOW TABLES;'

execute_sql(sql)

(('context_df',),
 ('intensity_df',),
 ('polarity_df',),
 ('response_df',),
 ('wellness_df',))

#### 3) Load data

In [6]:
sql = 'DESC wellness_df;'

execute_sql(sql)

(('idx', 'int(11)', 'NO', 'PRI', None, ''),
 ('intent', 'varchar(100)', 'NO', '', None, ''),
 ('keyword', 'varchar(100)', 'NO', '', None, ''),
 ('utterance', 'varchar(1000)', 'NO', '', None, ''),
 ('intent_label', 'int(11)', 'NO', '', None, ''),
 ('intent_keyword', 'varchar(100)', 'NO', '', None, ''),
 ('ik_label', 'int(11)', 'NO', '', None, ''))

In [7]:
sql = 'DESC response_df;'

execute_sql(sql)

(('idx', 'int(11)', 'NO', 'PRI', None, ''),
 ('Question', 'varchar(1000)', 'NO', '', None, ''),
 ('Answer', 'varchar(1000)', 'NO', '', None, ''))

In [8]:
sql = 'SELECT Question, Answer FROM response_df;'

data = execute_sql(sql)
data[:5]

(('12시 땡!', '하루가 또 가네요.'),
 ('1지망 학교 떨어졌어', '위로해 드립니다.'),
 ('3박4일 놀러가고 싶다', '여행은 언제나 좋죠.'),
 ('3박4일 정도 놀러가고 싶다', '여행은 언제나 좋죠.'),
 ('PPL 심하네', '눈살이 찌푸려지죠.'))

In [9]:
context_list = list(map(lambda x: x[0], data))
candidate_list = list(map(lambda x: x[1], data))

candidate_list[:3]

['하루가 또 가네요.', '위로해 드립니다.', '여행은 언제나 좋죠.']

#### 4) Stop MySQL

In [10]:
conn.close()

In [11]:
len(candidate_list)

11823

### 1. Pretrained tokenizer, model load 

In [16]:
data_path = './'

In [12]:
from transformers import BertTokenizer

tokenizer = BertTokenizer.from_pretrained(
    "beomi/kcbert-base",
    do_lower_case=False,
)

In [13]:
from transformers import BertConfig, BertModel

pretrained_model_config = BertConfig.from_pretrained(
    "beomi/kcbert-base"
)

model = BertModel.from_pretrained(
    "beomi/kcbert-base",
    config=pretrained_model_config,
)

Some weights of the model checkpoint at beomi/kcbert-base were not used when initializing BertModel: ['cls.predictions.transform.LayerNorm.weight', 'cls.predictions.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.decoder.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.dense.bias', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [19]:
## GPU
device = torch.device("cuda:0")
model = model.to(device)

### 2. Candidate Embedding 생성

In [21]:
cand_emb_list = np.load(os.path.join(data_path, 'cand_emb.npy'))
cand_emb_list = torch.Tensor(cand_emb_list).to(device)

### 3. Bi-encoder

In [34]:
def get_answer_bi(ctxt):
    '''
    ctxt을 입력으로 받아 가장 높은 score를 보이는 후보 답변 반환 
    '''
    
    c_list = [] 
    c_list.append(ctxt)
    
    con_features = tokenizer(   # CLS 토큰: input_ids - 2, SEP 토큰: input_ids - 3
        c_list,
        max_length=12,
        padding="max_length",
        truncation=True,
    )
    
    con_features = {k: torch.tensor(v).to(device) for k, v in con_features.items()}
    con_features = con_features
    con_outputs = model(**con_features)
    ctxt_emb = con_outputs.last_hidden_state[0][0].cpu().detach().numpy()
    
    score = []
    for cand_emb in cand_emb_list:
        cand_emb = cand_emb.cpu().detach().numpy()
        score.append(np.dot(ctxt_emb, cand_emb))
    
    return candidate_list[np.argmax(score)]

In [36]:
context = context_list[30]

start = time.time()
print(f'===== Bi-encoder 구현 ====== ') 
print(f'입력 문장: {context}')
print(f'챗봇 대답: {get_answer_bi(context)}')
print(f'소요 시간: {round(time.time() - start, 2)}(s), len(candidate): {len(candidate_list)}')
print(f'==============================')

입력 문장: 가족들이랑 서먹해
챗봇 대답: 마음이랑 잘 인사해요.
소요 시간: 0.24(s), len(candidate): 11823


### 4. Cross-encoder

In [38]:
from sentence_transformers.cross_encoder import CrossEncoder

cross_encoder = CrossEncoder('beomi/kcbert-base', max_length=512)

Some weights of the model checkpoint at beomi/kcbert-base were not used when initializing BertForSequenceClassification: ['cls.predictions.transform.LayerNorm.weight', 'cls.predictions.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.decoder.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.dense.bias', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initiali

In [39]:
def get_answer_cross(ctxt):
    '''
    ctxt을 입력으로 받아 가장 높은 score를 보이는 후보 답변 반환 
    '''
    sentence_combinations = [[ctxt, candidate] for candidate in candidate_list]
    similarity_scores = cross_encoder.predict(sentence_combinations)
    return candidate_list[np.argmax(similarity_scores)]

In [41]:
context = context_list[30]

start = time.time()
print('===== Cross-encoder 구현 =====')
print(f'입력 문장: {context}')
print(f'챗봇 대답: {get_answer_cross(context)}')
print(f'소요 시간: {round(time.time() - start, 2)}(s), len(candidate): {len(candidate_list)}')
print(f'==============================')

===== Cross-encoder 구현 =====
입력 문장: 가족들이랑 서먹해
챗봇 대답: 당신은 정말 멋진 사람이에요. 깍아 내리지 마세요.
소요 시간: 4.35(s), len(candidate): 11823


### 5. Poly-encoder

In [49]:
batch_size = con_features['input_ids'].size(0)   # 11823

NameError: name 'con_features' is not defined

In [52]:
def get_poly_code(batch_size, num_poly_codes):
    poly_code_embeddings = nn.Embedding(num_poly_codes, 768)
    poly_code_ids = torch.arange(num_poly_codes, dtype=torch.long)
    poly_code_ids = poly_code_ids.unsqueeze(0).expand(batch_size, num_poly_codes)
    poly_codes = poly_code_embeddings(poly_code_ids).to(device)
    
    return poly_codes

In [59]:
def dot_attention(query, key, value):
    # start = time.time() 
    d_k = key.shape[-1]   # 차원 수
    attention_score = torch.matmul(query, key.transpose(-2, -1))  # Q x K^T 
    attention_score = attention_score / math.sqrt(d_k)
    attention_prob = F.softmax(attention_score, dim=-1) 
    out = torch.matmul(attention_prob, value)
    # print(f'attention: {round(time.time() - start, 4)}(s)')
    return out 

In [60]:
def get_score(contexts, responses):
    contexts = dot_attention(responses, contexts, contexts)
    # print(np.shape(contexts[2]))
    # print(np.shape(responses))
    score = (contexts * responses).sum(-1)
    return int(score[0][0])

In [61]:
def get_answer_poly(ctxt):
    '''
    ctxt을 입력으로 받아 가장 높은 score를 보이는 후보 답변 반환 
    '''
    start = time.time()
    c_list = [] 
    c_list.append(ctxt)
    num_poly_codes = 64
    
    con_features = tokenizer(   # CLS 토큰: input_ids - 2, SEP 토큰: input_ids - 3
        c_list,
        max_length=12,
        padding="max_length",
        truncation=True,
    )
    
    con_features = {k: torch.tensor(v).to(device) for k, v in con_features.items()}
    con_outputs = model(**con_features)
    batch_size= con_features['input_ids'].size(0)   # 1
    poly_codes = get_poly_code(batch_size, num_poly_codes)
    print(f'소요 시간: {round(time.time() - start, 2)}(s)')
    
    keys = con_outputs[0].to(device); values = con_outputs[0].to(device)
    contexts = dot_attention(poly_codes, keys, values)
    
    print(f'소요 시간2: {round(time.time() - start, 2)}(s)')
    
    # contexts = contexts.detach().numpy()
    print(np.shape(contexts))
    score = []
    for can_emb in cand_emb_list:
        con_emb = dot_attention(can_emb, contexts, contexts)
        con_emb = con_emb.cpu().detach().numpy()
        can_emb = can_emb.cpu().detach().numpy()
        score.append(np.dot(con_emb, can_emb))
        
    print(f'소요 시간3: {round(time.time() - start, 2)}(s)')
    return candidate_list[np.argmax(score)]

In [62]:
context = '오늘 날씨 어때 ?'

start = time.time()
print(f'===== Poly-encoder 구현 ====== ') 
print(f'입력 문장: {context}')
print(f'챗봇 대답: {get_answer_poly(context)}')
print(f'소요 시간: {round(time.time() - start, 2)}(s), len(candidate): {len(candidate_list)}')
print(f'==============================')

입력 문장: 오늘 날씨 어때 ?
소요 시간: 0.01(s)
소요 시간2: 0.01(s)
torch.Size([1, 64, 768])
소요 시간3: 2.44(s)
챗봇 대답: 오늘 미세먼지가 많데요.
소요 시간: 2.45(s), len(candidate): 11823
