In [1]:
!pip install transformers



In [2]:
import tensorflow as tf
from transformers import AutoTokenizer
from transformers import TFGPT2LMHeadModel

In [3]:
tokenizer = AutoTokenizer.from_pretrained('skt/kogpt2-base-v2', bos_token='</s>', eos_token='</s>', pad_token='<pad>')
model = TFGPT2LMHeadModel.from_pretrained('skt/kogpt2-base-v2', from_pt=True)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.
TensorFlow and JAX classes are deprecated and will be removed in Transformers v5. We recommend migrating to PyTorch classes or pinning your version of Transformers.
Some weights of the PyTorch model were not used when initializing the TF 2.0 model TFGPT2LMHeadModel: ['transformer.h.3.attn.masked_bias', 'transformer.h.9.attn.masked_bias', 'transformer.h.0.attn.masked_bias', 'transformer.h.11.attn.masked_bias', 'transformer.h.2.attn.masked_bias', 'transformer.h.10.attn.masked_bias', 'transformer.h.7.attn.masked_bias', 'transformer.h.8.attn.masked_bias', 'transformer.h.1.attn.masked_b

In [4]:
print(tokenizer.bos_token_id) #문장 시작 토큰
print(tokenizer.eos_token_id) #문장 종료 토큰
print(tokenizer.pad_token_id)
print('-' * 10)
print(tokenizer.decode(1))
print(tokenizer.decode(2))#User
print(tokenizer.decode(3))
print(tokenizer.decode(4))#System


1
1
3
----------
</s>
<usr>
<pad>
<sys>


# 2) 챗봇 데이터 로드

In [5]:
import pandas as pd
import tqdm
import urllib.request

In [6]:
urllib.request.urlretrieve("https://raw.githubusercontent.com/songys/Chatbot_data/master/ChatbotData.csv", filename="ChatBotData.csv")
train_data = pd.read_csv('ChatBotData.csv')
print('챗봇 데이터의 개수 :', len(train_data))


챗봇 데이터의 개수 : 11823


# 3) 챗봇 데이터 전처리

In [7]:
def get_chat_data():
  for question, answer in zip(train_data.Q.to_list(), train_data.A.to_list()):
    bos_token = [tokenizer.bos_token_id]
    eos_token = [tokenizer.eos_token_id]
    sent = tokenizer.encode('<usr>' + question + '<sys>' + answer)
    yield bos_token + sent + eos_token

In [8]:
batch_size = 32

In [9]:
dataset = tf.data.Dataset.from_generator(get_chat_data, output_types=tf.int32, output_shapes = (None,))

In [10]:
dataset = dataset.padded_batch(batch_size=batch_size, padded_shapes=(None,), padding_values=tokenizer.pad_token_id)

In [11]:
for batch in dataset.take(1):
    print (batch)

tf.Tensor(
[[    1     2  9349  7888   739  7318   376     4 12557  6824  9108  9028
   7098 25856     1     3     3     3     3     3     3     3     3     3
      3     3     3     3     3     3]
 [    1     2  9020  8263  7497 10192 11615  8210  8006     4 12422  8711
   9535  7483 12521     1     3     3     3     3     3     3     3     3
      3     3     3     3     3     3]
 [    1     2  9085  7597   395  8149 10624  7397 24224 13358  7182     4
  12079  8135 16899  9677  8234   389     1     3     3     3     3     3
      3     3     3     3     3     3]
 [    1     2  9085  7597   395  8149  9465 10624  7397 24224 13358  7182
      4 12079  8135 16899  9677  8234   389     1     3     3     3     3
      3     3     3     3     3     3]
 [    1     2  9943   422   418  9327  8702  7098     4  9847 16912 18328
   8671  7415  8263  8234   389     1     3     3     3     3     3     3
      3     3     3     3     3     3]
 [    1     2  9815   410 21249 10174  6824  8210  800

In [12]:
print(tokenizer.decode(batch[0]))

</s><usr> 12시 땡!<sys> 하루가 또 가네요.</s><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad>


# 4) 챗봇 학습하기

In [13]:
adam = tf.keras.optimizers.Adam(learning_rate=3e-5, epsilon=1e-08)

In [14]:
steps = len(train_data) // batch_size + 1
print(steps)

370


In [15]:
print(type(batch), batch.shape, batch.dtype)


<class 'tensorflow.python.framework.ops.EagerTensor'> (32, 30) <dtype: 'int32'>


In [16]:
EPOCHS = 3

for epoch in range(EPOCHS):
  epoch_loss = 0

  for batch in tqdm.tqdm_notebook(dataset, total=steps):
      with tf.GradientTape() as tape:
          result = model(input_ids = batch, labels=batch)
          loss = result[0]
          batch_loss = tf.reduce_mean(loss)

      grads = tape.gradient(batch_loss, model.trainable_variables)
      adam.apply_gradients(zip(grads, model.trainable_variables))
      epoch_loss += batch_loss / steps

  print('[Epoch: {:>4}] cost = {:>.9}'.format(epoch + 1, epoch_loss))

Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  for batch in tqdm.tqdm_notebook(dataset, total=steps):


  0%|          | 0/370 [00:00<?, ?it/s]



[Epoch:    1] cost = 2.12707782


  0%|          | 0/370 [00:00<?, ?it/s]

[Epoch:    2] cost = 1.69828296


  0%|          | 0/370 [00:00<?, ?it/s]

[Epoch:    3] cost = 1.37568295


# 5) 챗봇 실행하기

In [17]:
text = '오늘도 좋은 하루!'

sent = '<usr>' + text + '<sys>'

input_ids = [tokenizer.bos_token_id] + tokenizer.encode(sent)
input_ids = tf.convert_to_tensor([input_ids])
print("정수 인코딩 후:", input_ids)
print("정수 인코딩을 재복원:", tokenizer.decode(input_ids[0]))

정수 인코딩 후: tf.Tensor([[    1     2 10070  7235 10586 12557   376     4]], shape=(1, 8), dtype=int32)
정수 인코딩을 재복원: </s><usr> 오늘도 좋은 하루!<sys>


In [18]:
output = model.generate(input_ids, max_length = 50, early_stopping = True, eos_token_id = tokenizer.eos_token_id)
decode_sentence = tokenizer.decode(output[0].numpy().tolist())
print(decode_sentence)

The following generation flags are not valid and may be ignored: ['early_stopping']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


</s><usr> 오늘도 좋은 하루!<sys> 오늘도 좋은 하루네요.</s>


In [19]:
# 챗봇 답변만 확인
print(decode_sentence.split('<sys> ')[1].replace('</s>', ''))

오늘도 좋은 하루네요.


In [20]:
#랜덤성 부여
output = model.generate(input_ids, max_length = 50, do_sample = True, top_k = 10)
decoded_sentence = tokenizer.decode(output[0].numpy().tolist())
print(decoded_sentence.split('<sys> ')[1].replace('</s>', ''))

오늘도 좋은 하루로 보답하세요.


In [28]:
def return_answer(user_text):
    sent = '<usr>' + user_text + '<sys>'
    input_ids = [tokenizer.bos_token_id] + tokenizer.encode(sent)
    input_ids = tf.convert_to_tensor([input_ids])
    output = model.generate(input_ids, max_length = 50, do_sample = True, top_k = 10)
    sentence = tokenizer.decode(output[0].numpy().tolist())
    chatbot_response = sentence.split('<sys> ')[1].replace('</s>', '')
    return chatbot_response

In [29]:
return_answer("안녕 반가워")

'안녕이 <unk>니요.'

In [30]:
return_answer("너무 심심한데 나랑 놀자")

'짝사랑으로 이어질 수 있네요.'

In [31]:
return_answer("너 코딩 좀 할 줄 아니?")

'그럴 수 있어요.'

In [32]:
return_answer('너 딥 러닝 잘해?')

'저도 궁금해요.'

In [33]:
return_answer('너 딥 러닝 잘해?')

'잘하는 걸로 결론이 나면 그게 진짜 목표예요.'

In [34]:
return_answer('너 딥 러닝 잘해?')

'직접 해보는 게 덜 고민인거 같아요.'

In [35]:
return_answer('너 딥 러닝 정말 잘해?')

'인공지능에 어떤 기술을 탑사이클로 하는지 알아보세요.'