# Seq2Seq Q&A Chatbot 구현

### 데이터 취득

In [1]:
import numpy as np 
import pandas as pd 

df = pd.read_csv('https://raw.githubusercontent.com/songys/Chatbot_data/refs/heads/master/ChatbotData.csv')
df = df[['Q', 'A']]
df


Unnamed: 0,Q,A
0,12시 땡!,하루가 또 가네요.
1,1지망 학교 떨어졌어,위로해 드립니다.
2,3박4일 놀러가고 싶다,여행은 언제나 좋죠.
3,3박4일 정도 놀러가고 싶다,여행은 언제나 좋죠.
4,PPL 심하네,눈살이 찌푸려지죠.
...,...,...
11818,훔쳐보는 것도 눈치 보임.,티가 나니까 눈치가 보이는 거죠!
11819,훔쳐보는 것도 눈치 보임.,훔쳐보는 거 티나나봐요.
11820,흑기사 해주는 짝남.,설렜겠어요.
11821,힘든 연애 좋은 연애라는게 무슨 차이일까?,잘 헤어질 수 있는 사이 여부인 거 같아요.


In [2]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 11823 entries, 0 to 11822
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   Q       11823 non-null  object
 1   A       11823 non-null  object
dtypes: object(2)
memory usage: 184.9+ KB


### 데이터 전처리

##### 토커나이저 학습 (sentencepiece) 활용

- 접두사, 접미사 처리 (bos, eos)
    - Train() 인자 cmd 옵션을 추가 (`<bos>, <eos>, <pad>, <oov>...`)
    - set_encode_extra_options(':') <br>
      set_encode_extra_options('bos:') <br>
      set_encode_extra_options(':eos') <br>
      set_encode_extra_options('bos:eos') 

In [3]:
import sentencepiece as spt

with open("chat_practice.txt", "w", encoding="utf-8") as f:
    for q, a in zip(df['Q'], df['A']):
        f.write(str(q).strip() + "\n")
        f.write(str(a).strip() + "\n")

input_file="chat_practice.txt"
model_prefix = "chatbot_spm"
vocab_size = 8000
model_type='unigram'
cmd = f'--input={input_file} --model_prefix={model_prefix} --vocab_size={vocab_size} --model_type={model_type}'

spt.SentencePieceTrainer.Train(cmd)

In [4]:
sp = spt.SentencePieceProcessor()
sp.load(f'{model_prefix}.model')

sp.set_encode_extra_options("bos:eos")

True

In [5]:
print(sp.encode_as_pieces("안녕하세요"))
print(sp.encode_as_ids("안녕하세요"))

['<s>', '▁안녕하세요', '</s>']
[1, 3159, 2]


##### 학습용 데이터 Q_input, A_input, A_traget 생성

In [6]:
def make_dataset(df, sp):
    Q_input, A_input, A_target = [], [], []

    for q, a in zip(df["Q"], df["A"]):
        q_ids = sp.encode_as_ids(str(q).strip())
        a_ids = sp.encode_as_ids(str(a).strip())

        A_input.append(a_ids[:-1])   
        A_target.append(a_ids[1:])   

        Q_input.append(q_ids)

    return Q_input, A_input, A_target

In [7]:
from tensorflow.keras.preprocessing.sequence import pad_sequences

Q_input, A_input, A_target = make_dataset(df, sp)

Q_max_len = max(len(seq) for seq in Q_input)
A_max_len = max(len(seq) for seq in A_input)

In [8]:
print(Q_max_len)
print(A_max_len)

26
36


In [9]:
Q_input_pad = pad_sequences(Q_input, maxlen=Q_max_len, padding='pre')
A_input_pad = pad_sequences(A_input, maxlen=A_max_len, padding='post')
A_target_pad = pad_sequences(A_target, maxlen=A_max_len, padding='post')

### 모델 생성 및 학습

In [10]:
import tensorflow as tf
from tensorflow.keras.layers import Input, LSTM, Embedding, Dense
from tensorflow.keras.models import Model

vocab_size = sp.get_piece_size()
embedding_dim = 128
hidden_dim = 256

##### 인코더 생성

In [11]:
encoder_inputs = Input(shape=(None,), dtype='int32', name='encoder_inputs')
enc_emb_layer = Embedding(vocab_size, embedding_dim, mask_zero=True, name="enc_embedding")
enc_emb = enc_emb_layer(encoder_inputs)
encoder_lstm = LSTM(hidden_dim, return_state=True, name="encoder_lstm")
_, state_h, state_c = encoder_lstm(enc_emb)
encoder_states = [state_h, state_c]

##### 디코더(teacher-forcing 모델) 생성

In [12]:
decoder_inputs = Input(shape=(None,), dtype='int32', name="decoder_inputs")
dec_emb_layer = Embedding(vocab_size, embedding_dim, mask_zero=True, name="dec_embedding")
dec_emb = dec_emb_layer(decoder_inputs)
decoder_lstm = LSTM(hidden_dim, return_sequences=True, return_state=True, name="decoder_lstm")
decoder_outputs, _, _ = decoder_lstm(dec_emb, initial_state=encoder_states)
decoder_dense = Dense(vocab_size, activation='softmax', name="decoder_dense")
decoder_outputs = decoder_dense(decoder_outputs)

##### 학습

In [13]:
model = Model([encoder_inputs, decoder_inputs], decoder_outputs)
model.compile(optimizer="adam", loss="sparse_categorical_crossentropy")

print(model.summary())

None


In [29]:
history = model.fit(
    [Q_input_pad, A_input_pad], 
    A_target_pad.reshape(*A_target_pad.shape, 1),  # sparse_categorical_crossentropy 맞추기 위해 차원추가
    batch_size=64,
    epochs=70,
    validation_split=0.2
)

Epoch 1/70
[1m148/148[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m46s[0m 309ms/step - loss: 0.1261 - val_loss: 8.5287
Epoch 2/70
[1m148/148[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m48s[0m 324ms/step - loss: 0.1163 - val_loss: 8.6001
Epoch 3/70
[1m148/148[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m48s[0m 324ms/step - loss: 0.1065 - val_loss: 8.6335
Epoch 4/70
[1m148/148[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m48s[0m 324ms/step - loss: 0.0976 - val_loss: 8.6950
Epoch 5/70
[1m148/148[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m48s[0m 326ms/step - loss: 0.0887 - val_loss: 8.7492
Epoch 6/70
[1m148/148[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m50s[0m 338ms/step - loss: 0.0807 - val_loss: 8.8142
Epoch 7/70
[1m148/148[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m50s[0m 341ms/step - loss: 0.0743 - val_loss: 8.8411
Epoch 8/70
[1m148/148[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m51s[0m 343ms/step - loss: 0.0698 - val_loss: 8.9124
Epoch 9/70
[1m1

### 모델 추론

##### 디코더 (추론 모델) 생성

In [30]:
encoder_model = Model(encoder_inputs, encoder_states)

In [None]:
# decoder_state_input_h = Input(shape=(hidden_dim,))
# decoder_state_input_c = Input(shape=(hidden_dim,))
# decoder_states_inputs = [decoder_state_input_h, decoder_state_input_c]

# dec_emb2 = Embedding(vocab_size, embedding_dim, mask_zero=True)(decoder_inputs)
# decoder_outputs2, state_h2, state_c2 = decoder_lstm(dec_emb2, initial_state=decoder_states_inputs)
# decoder_states2 = [state_h2, state_c2]
# decoder_outputs2 = decoder_dense(decoder_outputs2)

# decoder_model = Model(
#     [decoder_inputs] + decoder_states_inputs,
#     [decoder_outputs2] + decoder_states2
# )

In [31]:
dec_infer_inputs = Input(shape=(None,), dtype='int32', name="dec_infer_inputs")
state_in_h = Input(shape=(hidden_dim,), name="state_in_h")
state_in_c = Input(shape=(hidden_dim,), name="state_in_c")
states_in = [state_in_h, state_in_c]

dec_infer_emb = dec_emb_layer(dec_infer_inputs) 
dec_infer_outputs, out_h, out_c = decoder_lstm(dec_infer_emb, initial_state=states_in)  
dec_infer_logits = decoder_dense(dec_infer_outputs)  
decoder_model = Model([dec_infer_inputs] + states_in, [dec_infer_logits, out_h, out_c])


##### 추론 함수

In [32]:
def decode_sequence(input_seq):
    states_value = encoder_model.predict(input_seq)

    target_seq = np.array([[sp.bos_id()]])

    stop_condition = False
    decoded_sentence = []

    while not stop_condition:
        output_tokens, h, c = decoder_model.predict([target_seq] + states_value)

        sampled_token_index = int(np.argmax(output_tokens[0, -1, :]))
        sampled_word = sp.id_to_piece(sampled_token_index)

        if sampled_word in ['</s>', '<pad>'] or len(decoded_sentence) > A_max_len:
            stop_condition = True
        else:
            decoded_sentence.append(sampled_word)

        target_seq = np.array([[sampled_token_index]])
        states_value = [h, c]

    return ''.join(decoded_sentence).replace('▁', ' ').strip()

##### 테스트

### 간단한 Chatbot 구현

1. 사용자의 입력을 받아 (처리)
2. 추론 함수에 전달해서
3. 응답을 출력
4. 1~3 '종료' 전까지 반복

In [34]:
print("시작합니다. 종료하려면 '종료'를 입력 해 주세요.")
while True:
    user_input = input("나: ")
    if user_input.strip() == "종료":
        print("종료합니다.")
        break

    sp.set_encode_extra_options("")
    seq = sp.encode_as_ids(user_input)
    seq = pad_sequences([seq], maxlen=Q_max_len, padding='pre')

    reply = decode_sequence(seq)
    print(f"[입력] {user_input}")
    print(f"[응답] {reply}")

시작합니다. 종료하려면 '종료'를 입력 해 주세요.
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 27ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 29ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 28ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 28ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 27ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 28ms/step
[입력] 나 너무 피곤해
[응답] 네 말씀하세요.
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 28ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 28ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 30ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 27ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 32ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 31ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 30ms/step
[입력] 이번