In [1]:
# 필요한 모듈 import
import pandas as pd
import tensorflow as tf
from tensorflow.keras import preprocessing
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, Embedding, Dense, Dropout, Conv1D, GlobalMaxPool1D, concatenate

In [3]:
# 데이터 읽어오기
train_file = './ChatBotData.csv'
data = pd.read_csv(train_file, delimiter = ',')
features = data['Q'].tolist()
labels = data['label'].tolist()

# 단어 인덱스 시퀀스 벡터
corpus = [preprocessing.text.text_to_word_sequence(text) for text in features]   # 말뭉치
corpus

[['12시', '땡'],
 ['1지망', '학교', '떨어졌어'],
 ['3박4일', '놀러가고', '싶다'],
 ['3박4일', '정도', '놀러가고', '싶다'],
 ['ppl', '심하네'],
 ['sd카드', '망가졌어'],
 ['sd카드', '안돼'],
 ['sns', '맞팔', '왜', '안하지ㅠㅠ'],
 ['sns', '시간낭비인', '거', '아는데', '매일', '하는', '중'],
 ['sns', '시간낭비인데', '자꾸', '보게됨'],
 ['sns보면', '나만', '빼고', '다', '행복해보여'],
 ['가끔', '궁금해'],
 ['가끔', '뭐하는지', '궁금해'],
 ['가끔은', '혼자인게', '좋다'],
 ['가난한', '자의', '설움'],
 ['가만', '있어도', '땀난다'],
 ['가상화폐', '쫄딱', '망함'],
 ['가스불', '켜고', '나갔어'],
 ['가스불', '켜놓고', '나온거', '같아'],
 ['가스비', '너무', '많이', '나왔다'],
 ['가스비', '비싼데', '감기', '걸리겠어'],
 ['가스비', '장난', '아님'],
 ['가장', '확실한', '건', '뭘까'],
 ['가족', '여행', '가기로', '했어'],
 ['가족', '여행', '고고'],
 ['가족', '여행', '어디로', '가지'],
 ['가족', '있어'],
 ['가족관계', '알려', '줘'],
 ['가족끼리', '여행간다'],
 ['가족들', '보고', '싶어'],
 ['가족들이랑', '서먹해'],
 ['가족들이랑', '서먹해졌어'],
 ['가족들이랑', '어디', '가지'],
 ['가족들이랑', '여행', '갈거야'],
 ['가족여행', '가야지'],
 ['가족이', '누구야'],
 ['가족이랑', '여행', '가려고'],
 ['가족한테', '스트레스', '풀었어'],
 ['가출할까'],
 ['가출해도', '갈', '데가', '없어'],
 ['간만에', '떨리니까', '좋더라'],
 ['간만에', '쇼핑', '

In [5]:
tokenizer = preprocessing.text.Tokenizer()
tokenizer.fit_on_texts(corpus)
sequences = tokenizer.texts_to_sequences(corpus)
sequences = tokenizer.texts_to_sequences(corpus)
word_index = tokenizer.word_index
word_index

{'너무': 1,
 '좋아하는': 2,
 '거': 3,
 '싶어': 4,
 '같아': 5,
 '안': 6,
 '나': 7,
 '좀': 8,
 '사람': 9,
 '내가': 10,
 '싶다': 11,
 '어떻게': 12,
 '썸': 13,
 '왜': 14,
 '내': 15,
 '사람이': 16,
 '이별': 17,
 '것': 18,
 '잘': 19,
 '없어': 20,
 '수': 21,
 '오늘': 22,
 '자꾸': 23,
 '이제': 24,
 '있어': 25,
 '또': 26,
 '많이': 27,
 '다': 28,
 '있을까': 29,
 '헤어진지': 30,
 '해': 31,
 '다시': 32,
 '될까': 33,
 '여자친구가': 34,
 '남자친구가': 35,
 '더': 36,
 '진짜': 37,
 '정말': 38,
 '게': 39,
 '나를': 40,
 '뭐': 41,
 '좋아': 42,
 '할까': 43,
 '하고': 44,
 '하는': 45,
 '연애': 46,
 '있는': 47,
 '계속': 48,
 '힘드네': 49,
 '연락': 50,
 '이': 51,
 '나만': 52,
 '먹고': 53,
 '이렇게': 54,
 '있는데': 55,
 '못': 56,
 '날': 57,
 '혼자': 58,
 '다른': 59,
 '방법': 60,
 '타는': 61,
 '한': 62,
 '그': 63,
 '안돼': 64,
 '그냥': 65,
 '없는': 66,
 '돼': 67,
 '짝남이': 68,
 '좋겠다': 69,
 '선물': 70,
 '모르겠어': 71,
 '같이': 72,
 '나한테': 73,
 '같은데': 74,
 '싫어': 75,
 '친구가': 76,
 '마음이': 77,
 '짝사랑': 78,
 '가고': 79,
 '사랑': 80,
 '헤어진': 81,
 '많아': 82,
 '힘들어': 83,
 '연락이': 84,
 '줄': 85,
 '좋겠어': 86,
 '술': 87,
 '후': 88,
 '짝남': 89,
 '듯': 90,
 '좋은': 91,
 '좋을까

In [6]:
MAX_SEQ_LEN = 15  # 단어 시퀀스 벡터 크기
padded_seqs = preprocessing.sequence.pad_sequences(sequences, maxlen=MAX_SEQ_LEN, padding='post')

In [8]:
# train, validation, test dataset 생성
#  7 : 2 : 1
dataset = tf.data.Dataset.from_tensor_slices((padded_seqs, labels))
dataset = dataset.shuffle(len(features))

train_size = int(len(padded_seqs) * 0.7)
val_size = int(len(padded_seqs) * 0.2)
test_size = int(len(padded_seqs) * 0.1)

train_ds = dataset.take(train_size).batch(20)
val_ds = dataset.skip(train_size).take(val_size).batch(20)
test_ds = dataset.skip(train_size + val_size).take(test_size).batch(20)

# 하이퍼파라미터 설정
dropout_prob = 0.5
EMB_SIZE = 128
EPOCH = 5
VOCAB_SIZE = len(word_index) + 1  # 전체 단어 수

In [10]:
# CNN 모델 정의
input_layer = Input(shape = (MAX_SEQ_LEN,))
embedding_layer = Embedding(VOCAB_SIZE, EMB_SIZE, input_length=MAX_SEQ_LEN)(input_layer)
dropout_emb = Dropout(rate=dropout_prob)(embedding_layer)

conv1 = Conv1D(
    filters=128,
    kernel_size=3,
    padding='valid',
    activation=tf.nn.relu)(dropout_emb)
pool1 = GlobalMaxPool1D()(conv1)

conv2 = Conv1D(
    filters=128,
    kernel_size=4,
    padding='valid',
    activation=tf.nn.relu)(dropout_emb)
pool2 = GlobalMaxPool1D()(conv2)

conv3 = Conv1D(
    filters=128,
    kernel_size=5,
    padding='valid',
    activation=tf.nn.relu)(dropout_emb)
pool3 = GlobalMaxPool1D()(conv3)

# 3, 4, 5-gram 이후 합치기
concat = concatenate([pool1, pool2, pool3])

hidden = Dense(128, activation=tf.nn.relu)(concat)
dropout_hidden = Dropout(rate=dropout_prob)(hidden)
logits = Dense(3, name='logits')(dropout_hidden)
predictions = Dense(3, activation=tf.nn.softmax)(logits)

In [12]:
# 모델 생성
model = Model(inputs=input_layer, outputs=predictions)
model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])

In [13]:
# 모델 학습
model.fit(train_ds, validation_data=val_ds, epochs=EPOCH, verbose=1)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<tensorflow.python.keras.callbacks.History at 0x29f91bc4fd0>

In [15]:
# 모델 평가(테스트 데이터셋 이용)
loss, accuracy = model.evaluate(test_ds, verbose=1)
print('Accuracy: %f' %(accuracy * 100))
print('loss: %f' %(loss))

# 모델 저장
model.save('cnn_model.h5')

Accuracy: 97.800338
loss: 0.064536


In [17]:
from tensorflow.keras.models import load_model
# 감정 분류 CNN모델 불러오기
model = load_model('cnn_model.h5')
model.summary()
model.evaluate(test_ds, verbose=2)

Model: "model_1"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_1 (InputLayer)            [(None, 15)]         0                                            
__________________________________________________________________________________________________
embedding (Embedding)           (None, 15, 128)      1715072     input_1[0][0]                    
__________________________________________________________________________________________________
dropout (Dropout)               (None, 15, 128)      0           embedding[0][0]                  
__________________________________________________________________________________________________
conv1d (Conv1D)                 (None, 13, 128)      49280       dropout[0][0]                    
____________________________________________________________________________________________

[0.06181996315717697, 0.9796954393386841]

In [18]:
# 테스트용 데이터셋의 10200번째 데이터 출력
print('단어 시퀀스 : ', corpus[10200])
print('단어 인덱스 시퀀스 : ', padded_seqs[10200])
print('문장 분류(정답) : ', labels[10200])

단어 시퀀스 :  ['썸', '타는', '사람이랑', '밥', '먹기로', '함']
단어 인덱스 시퀀스 :  [  13   61  155  251 2361  664    0    0    0    0    0    0    0    0
    0]
문장 분류(정답) :  2


In [19]:
# 테스트용 데이터셋의 10200번째 데이터 감정 예측
picks = [10200]
predict = model.predict(padded_seqs[picks])
predict_class = tf.math.argmax(predict, axis=1)
print('감정 예측 점수 : ', predict)
print('감정 예측 클래스 : ', predict_class.numpy())

감정 예측 점수 :  [[4.7291091e-09 5.0719393e-09 1.0000000e+00]]
감정 예측 클래스 :  [2]
