In [1]:
import tensorflow as tf
from tensorflow import keras
import numpy as np
import pandas as pd
import os
import re
import matplotlib.pyplot as plt
import tensorflow_datasets as tfds
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.layers import Input, Attention, Dense, LSTM, Embedding, \
    Bidirectional, Concatenate
from tensorflow.keras.models import Model

In [2]:
# TensorFlow 버전 확인
print(tf.__version__)

# 데이터 다운로드 및 준비
data_path = os.getenv('HOME') + '/aiffel/quest/EX16_transformer_chatbot/data/ChatbotData.csv'
data = pd.read_csv(data_path)

2.6.0


In [3]:
# 전처리 함수
def preprocess_sentence(sentence):
    sentence = sentence.lower().strip()
    sentence = re.sub(r"([?.!,])", r" \1 ", sentence)
    sentence = re.sub(r'[" "]+', " ", sentence)
    sentence = re.sub(r"[^a-zA-Z가-힣?.!,]+", " ", sentence)
    sentence = sentence.strip()
    return sentence

def preprocess_data(data):
    data['Q'] = data['Q'].apply(preprocess_sentence)
    data['A'] = data['A'].apply(preprocess_sentence)
    return data

# 데이터 전처리
data = preprocess_data(data)

In [4]:
# 질문과 대답 데이터셋 생성
questions = data['Q']
answers = data['A']

# 질문과 대답 데이터셋 분리
tokenizer = Tokenizer()
tokenizer.fit_on_texts(list(questions) + list(answers))

# 단어 집합 크기
vocab_size = len(tokenizer.word_index) + 1
print('단어 집합 크기:', vocab_size)

단어 집합 크기: 20491


In [5]:
# 토크나이징 및 패딩
def tokenize_and_pad(texts, tokenizer, max_len):
    sequences = tokenizer.texts_to_sequences(texts)
    padded_sequences = pad_sequences(sequences, maxlen=max_len, padding='post')
    return padded_sequences

max_len = 30

# 질문 데이터 토크나이징 및 패딩
questions = tokenize_and_pad(questions, tokenizer, max_len)

# 대답 데이터 토크나이징 및 패딩
answers = tokenize_and_pad(answers, tokenizer, max_len)

In [6]:
# 학습 데이터와 검증 데이터 분리
num_samples = len(questions)
train_ratio = 0.8
train_samples = int(num_samples * train_ratio)

train_questions = questions[:train_samples]
train_answers = answers[:train_samples]

val_questions = questions[train_samples:]
val_answers = answers[train_samples:]

In [7]:
def transformer_model(vocab_size, max_len, embed_dim, num_heads, ff_dim, num_blocks):
    inputs = tf.keras.layers.Input(shape=(max_len, embed_dim))
    x = inputs

    for _ in range(num_blocks):
        x = transformer_block(x, embed_dim, num_heads, ff_dim)

    outputs = tf.keras.layers.Dense(vocab_size, activation='softmax')(x)
    model = tf.keras.Model(inputs=inputs, outputs=outputs)
    return model

In [8]:
# Positional Encoding 함수
def get_positional_encoding(positions, d_model):
    positions = tf.cast(positions, tf.float32)  # Cast positions to float32
    angle_rates = 1 / tf.math.pow(10000, (2 * tf.range(d_model // 2, dtype=tf.float32) / tf.cast(d_model, tf.float32)))
    angle_rads = tf.matmul(positions[:, :, tf.newaxis], angle_rates[tf.newaxis, :])

    # Arrays with even indices get the sine function applied
    sines = tf.math.sin(angle_rads[:, 0::2])
    # Arrays with odd indices get the cosine function applied
    cosines = tf.math.cos(angle_rads[:, 1::2])

    # Combine the arrays back into a single tensor
    angle_rads = tf.stack([sines, cosines], axis=-1)
    angle_rads = tf.reshape(angle_rads, (tf.shape(angle_rads)[0], -1))

    return angle_rads

In [9]:
# Multi-head Attention 레이어
class MultiHeadAttention(tf.keras.layers.Layer):
    def __init__(self, embed_dim, num_heads):
        super(MultiHeadAttention, self).__init__()
        self.embed_dim = embed_dim
        self.num_heads = num_heads
        if embed_dim % num_heads != 0:
            raise ValueError("Embedding dimension should be divisible by number of heads.")
        self.projection_dim = embed_dim // num_heads
        self.query_dense = Dense(embed_dim)
        self.key_dense = Dense(embed_dim)
        self.value_dense = Dense(embed_dim)
        self.combine_heads = Dense(embed_dim)
    
    def attention(self, query, key, value):
        score = tf.matmul(query, key, transpose_b=True)
        dim_key = tf.cast(tf.shape(key)[-1], tf.float32)
        scaled_score = score / tf.math.sqrt(dim_key)
        weights = tf.nn.softmax(scaled_score, axis=-1)
        output = tf.matmul(weights, value)
        return output, weights
    
    def separate_heads(self, x, batch_size):
        x = tf.reshape(x, (batch_size, -1, self.num_heads, self.projection_dim))
        return tf.transpose(x, perm=[0, 2, 1, 3])
    
    def call(self, inputs):
        query = inputs['query']
        key = inputs['key']
        value = inputs['value']
        mask = inputs['mask']
        batch_size = tf.shape(query)[0]
        
        # Query, Key, Value에 각각 Dense 레이어 적용
        query = self.query_dense(query)
        key = self.key_dense(key)
        value = self.value_dense(value)
        
        # Multi-heads로 나누기
        query = self.separate_heads(query, batch_size)
        key = self.separate_heads(key, batch_size)
        value = self.separate_heads(value, batch_size)
        
        # Attention 적용
        attention, weights = self.attention(query, key, value)
        attention = tf.transpose(attention, perm=[0, 2, 1, 3])
        concat_attention = tf.reshape(attention, (batch_size, -1, self.embed_dim))
        
        # Output 생성
        outputs = self.combine_heads(concat_attention)
        return outputs

In [10]:
def transformer_block(inputs, embed_dim, num_heads, ff_dim, rate=0.1):
    attn_outputs = MultiHeadAttention(embed_dim, num_heads)(inputs)
    attn_outputs = tf.keras.layers.LayerNormalization(epsilon=1e-6)(inputs + attn_outputs)
    ffn_outputs = tf.keras.layers.Dense(ff_dim, activation='relu')(attn_outputs)
    ffn_outputs = tf.keras.layers.Dense(embed_dim)(ffn_outputs)
    ffn_outputs = tf.keras.layers.LayerNormalization(epsilon=1e-6)(attn_outputs + ffn_outputs)
    outputs = tf.keras.layers.Dropout(rate)(ffn_outputs)
    return outputs

In [11]:
# 하이퍼파라미터 설정
embed_dim = 256
num_heads = 8
ff_dim = 1024
num_blocks = 4

In [12]:
# 모델 생성
model = transformer_model(vocab_size, max_len, embed_dim, num_heads, ff_dim, num_blocks)

# Loss, Optimizer 설정
model.compile(loss='sparse_categorical_crossentropy', optimizer='adam')

# 모델 학습
history = model.fit(train_questions, train_answers, batch_size=128, epochs=20, validation_data=(val_questions, val_answers))

TypeError: in user code:

    /tmp/ipykernel_393/333398800.py:28 call  *
        query = inputs['query']
    /opt/conda/lib/python3.9/site-packages/tensorflow/python/util/dispatch.py:206 wrapper
        return target(*args, **kwargs)
    /opt/conda/lib/python3.9/site-packages/tensorflow/python/ops/array_ops.py:1014 _slice_helper
        _check_index(s)
    /opt/conda/lib/python3.9/site-packages/tensorflow/python/ops/array_ops.py:888 _check_index
        raise TypeError(_SLICE_TYPE_ERROR + ", got {!r}".format(idx))

    TypeError: Only integers, slices (`:`), ellipsis (`...`), tf.newaxis (`None`) and scalar tf.int32/tf.int64 tensors are valid indices, got 'query'


In [None]:
# 예측 함수
def predict(sentence):
    sentence = preprocess_sentence(sentence)
    sentence = tokenize_and_pad([sentence], tokenizer, max_len)
    result = model.predict(sentence)
    result = np.argmax(result, axis=-1)
    return result

In [None]:
# 예측 결과 확인
print(predict("안녕하세요"))