In [1]:
from keras.preprocessing.text import Tokenizer, text_to_word_sequence
from keras.preprocessing.sequence import pad_sequences
from keras.utils import to_categorical
import numpy as np 
import pandas as pd
from keras.models import Sequential
from keras.layers import *

# 한글닉네임 불러오기

In [2]:
# 전체 닉네임리스트 (글자딕셔너리 만들기 위해)
df = pd.read_csv("nick_korean.csv")
knick = list(df['닉네임'])
# 두글자 닉네임 따로 떼기 (DNN모델을 따로 만들기 위해)
k_two_nick = df[df['닉네임'].str.len()<=2]
k_two_nick = list(k_two_nick['닉네임'])
# 세글자 이상 닉네임 따로 떼기 (RNN모델 위해)
k_upthree_nick = df[df['닉네임'].str.len()>2]
k_upthree_nick = list(k_upthree_nick['닉네임'])

In [3]:
k_upthree_nick[:11]

['그림자궁전',
 '액션꽁주',
 '하고싶은거다해',
 '빛나는옥쇄',
 '그린향기',
 '정의로운악당',
 '오꼬밍',
 '못난인형',
 '루아흐',
 '프랭크',
 '엄마는멋쟁이']

In [4]:
len(k_upthree_nick) # 2580

2580

In [5]:
len(max(k_upthree_nick, key=len)) #7

7

In [13]:
charSet = []
for nick in knick:
    charSet.extend(list(set(nick)))

charSet = list(set(charSet)) # 중복제거
charDic = {w:i+1 for i,w in enumerate(charSet)} # 글자딕셔너리 (글자:인덱스)
numDic = {i+1:w for i,w in enumerate(charSet)} # 글자딕셔너리 (인덱스:글자)

In [8]:
charsize = len(charSet) # 968

In [9]:
# 단어 숫자화
char_numlist = []
for xStr in k_upthree_nick:
    x = [charDic[c] for c in xStr]
    char_numlist.append(x)
char_numlist[:11]

[[36, 641, 599, 148, 373],
 [621, 743, 918, 921],
 [57, 690, 793, 137, 154, 632, 20],
 [431, 282, 236, 414, 546],
 [36, 124, 775, 376],
 [783, 392, 549, 163, 576, 845],
 [7, 745, 749],
 [964, 940, 421, 60],
 [395, 338, 930],
 [294, 334, 436],
 [625, 669, 236, 856, 751, 660]]

In [10]:
# 순차적 글자배열만들기
nick_sequences = []
for nick in k_upthree_nick: # 2580개 닉네임 , 968가지 단어 
    encoded=[charDic[c] for c in nick] 
    for i in range(1, len(encoded)):
        sequence=encoded[:i+1]
        nick_sequences.append(sequence)
nick_sequences[:11]

[[36, 641],
 [36, 641, 599],
 [36, 641, 599, 148],
 [36, 641, 599, 148, 373],
 [621, 743],
 [621, 743, 918],
 [621, 743, 918, 921],
 [57, 690],
 [57, 690, 793],
 [57, 690, 793, 137],
 [57, 690, 793, 137, 154]]

In [11]:
# 패딩해주기 (최대길이7)
sequences=pad_sequences(nick_sequences, maxlen=7, padding='pre')
print(sequences[:5])

[[  0   0   0   0   0  36 641]
 [  0   0   0   0  36 641 599]
 [  0   0   0  36 641 599 148]
 [  0   0  36 641 599 148 373]
 [  0   0   0   0   0 621 743]]


In [16]:
# x,y데이터 나누기
x = sequences[:,:-1]
y = sequences[:,-1]

In [19]:
# x,y데이터 확인
print(x[:11])
print(y[:11])

[[  0   0   0   0   0  36]
 [  0   0   0   0  36 641]
 [  0   0   0  36 641 599]
 [  0   0  36 641 599 148]
 [  0   0   0   0   0 621]
 [  0   0   0   0 621 743]
 [  0   0   0 621 743 918]
 [  0   0   0   0   0  57]
 [  0   0   0   0  57 690]
 [  0   0   0  57 690 793]
 [  0   0  57 690 793 137]]
[641 599 148 373 743 918 921 690 793 137 154]


In [32]:
x.shape

(7600, 6)

In [22]:
# y데이터 원핫인코딩
y = to_categorical(y)
y

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]], dtype=float32)

In [29]:
y.shape

(7600, 969)

In [23]:
len(y[0]) # 패딩으로 사용하는 0과 charsize 1~968

969

In [39]:
#모델 생성
model=Sequential()
model.add(Embedding(969, 50,input_length=6))
model.add(LSTM( 50*128, return_sequences=True ))
model.add(LSTM( 50*64, return_sequences=True ))
model.add(LSTM( 50*4, return_sequences=False ))
model.add(Dense( 969 ))
model.add(Activation('softmax')) #여러개의 문자중에 하나 출력 : softmax
model.summary()

Model: "sequential_3"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_3 (Embedding)      (None, 6, 50)             48450     
_________________________________________________________________
lstm_12 (LSTM)               (None, 6, 6400)           165145600 
_________________________________________________________________
lstm_13 (LSTM)               (None, 6, 3200)           122892800 
_________________________________________________________________
lstm_14 (LSTM)               (None, 6, 800)            12803200  
_________________________________________________________________
lstm_15 (LSTM)               (None, 6, 200)            800800    
_________________________________________________________________
flatten (Flatten)            (None, 1200)              0         
_________________________________________________________________
dense_3 (Dense)              (None, 969)              

In [41]:
model.compile(loss='categorical_crossentropy',
              optimizer='adam',
              metrics=['accuracy'])
model.fit(x, y, epochs=200)

Epoch 1/200


KeyboardInterrupt: 

In [None]:
# 모델 테스트
def sentence_generation(model, t, cw, n):
    initWord = cw # 시작단어 
    sentence = ''
    for _ in range(n): # 10번 반복한다  _ :10번 반복 하는 동안 값을 받는 변수가 없다. 
        encoded=t.texts_to_sequences([cw])[0] # [2]
        encoded=pad_sequences([encoded], maxlen=23, padding='pre')
        # print(encoded) #[2] ->-[0 0 0 0 2]
        result = model.predict_classes(encoded)
        # print(result) #[3] ==word_index[3]의 index
        for word, index in t.word_index.items():
            if index==result:
                break
        cw=cw+" "+word 
        sentence = sentence+" "+word
    initWord+sentence
    return sentence

sentence_generation(model, t, 'i', 10) 
# t: Tokenizer
#'i' :시작단어
# 단어 10개를 예측 해라

In [None]:
# 6. 모델 저장하기
from keras.models import load_model
model.save('mnist_mlp_model.h5')