In [1]:
from keras.preprocessing.text import Tokenizer, text_to_word_sequence
from keras.preprocessing.sequence import pad_sequences
from keras.utils import to_categorical
import numpy as np 
import pandas as pd
from keras.models import Sequential
from keras.layers import *

# 한글닉네임 불러오기

In [2]:
# 전체 닉네임리스트 (글자딕셔너리 만들기 위해)
df = pd.read_csv("/content/nick_korean.csv")
knick = list(df['닉네임'])
# 두글자 닉네임 따로 떼기 (DNN모델을 따로 만들기 위해)
k_two_nick = df[df['닉네임'].str.len()<=2]
k_two_nick = list(k_two_nick['닉네임'])
# 세글자 이상 닉네임 따로 떼기 (RNN모델 위해)
k_upthree_nick = df[df['닉네임'].str.len()>2]
k_upthree_nick = list(k_upthree_nick['닉네임'])

In [None]:
from google.colab import drive
drive.mount('/content/drive')

In [3]:
k_upthree_nick[:11]

['그림자궁전',
 '액션꽁주',
 '하고싶은거다해',
 '빛나는옥쇄',
 '그린향기',
 '정의로운악당',
 '오꼬밍',
 '못난인형',
 '루아흐',
 '프랭크',
 '엄마는멋쟁이']

In [4]:
len(k_upthree_nick) # 2580

2580

In [5]:
len(max(k_upthree_nick, key=len)) #7

7

In [6]:
charSet = []
for nick in knick:
    charSet.extend(list(set(nick)))

charSet = list(set(charSet)) # 중복제거
charDic = {w:i+1 for i,w in enumerate(charSet)} # 글자딕셔너리 (글자:인덱스)
numDic = {i+1:w for i,w in enumerate(charSet)} # 글자딕셔너리 (인덱스:글자)

In [7]:
charsize = len(charSet) # 968

In [8]:
charsize

968

In [9]:
# 단어 숫자화
char_numlist = []
for xStr in k_upthree_nick:
    x = [charDic[c] for c in xStr]
    char_numlist.append(x)
char_numlist[:11]

[[137, 651, 901, 40, 183],
 [432, 188, 486, 54],
 [200, 525, 906, 252, 9, 891, 753],
 [12, 630, 705, 481, 276],
 [137, 388, 104, 905],
 [175, 493, 592, 934, 547, 492],
 [566, 552, 277],
 [694, 922, 179, 4],
 [612, 399, 391],
 [33, 555, 967],
 [688, 460, 705, 942, 724, 403]]

In [10]:
# 순차적 글자배열만들기
nick_sequences = []
for nick in k_upthree_nick: # 2580개 닉네임 , 968가지 단어 
    encoded=[charDic[c] for c in nick] 
    for i in range(1, len(encoded)):
        sequence=encoded[:i+1]
        nick_sequences.append(sequence)
nick_sequences[:11]

[[137, 651],
 [137, 651, 901],
 [137, 651, 901, 40],
 [137, 651, 901, 40, 183],
 [432, 188],
 [432, 188, 486],
 [432, 188, 486, 54],
 [200, 525],
 [200, 525, 906],
 [200, 525, 906, 252],
 [200, 525, 906, 252, 9]]

In [11]:
# 패딩해주기 (최대길이7)
sequences=pad_sequences(nick_sequences, maxlen=7, padding='pre')
print(sequences[:5])

[[  0   0   0   0   0 137 651]
 [  0   0   0   0 137 651 901]
 [  0   0   0 137 651 901  40]
 [  0   0 137 651 901  40 183]
 [  0   0   0   0   0 432 188]]


In [12]:
# x,y데이터 나누기
x = sequences[:,:-1]
y = sequences[:,-1]

In [13]:
# x,y데이터 확인
print(x[:11])
print(y[:11])

[[  0   0   0   0   0 137]
 [  0   0   0   0 137 651]
 [  0   0   0 137 651 901]
 [  0   0 137 651 901  40]
 [  0   0   0   0   0 432]
 [  0   0   0   0 432 188]
 [  0   0   0 432 188 486]
 [  0   0   0   0   0 200]
 [  0   0   0   0 200 525]
 [  0   0   0 200 525 906]
 [  0   0 200 525 906 252]]
[651 901  40 183 188 486  54 525 906 252   9]


In [14]:
x.shape

(7600, 6)

In [15]:
# y데이터 원핫인코딩
y = to_categorical(y)
y

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]], dtype=float32)

In [16]:
y.shape

(7600, 969)

In [17]:
len(y[0]) # 패딩으로 사용하는 0과 charsize 1~968

969

In [18]:
#모델 생성 test1
# model=Sequential()
# model.add(Embedding(969, 50,input_length=6))
# model.add(LSTM( 50*128, return_sequences=True ))
# model.add(LSTM( 50*64, return_sequences=True ))
# model.add(LSTM( 50*4, return_sequences=False ))
# model.add(Dense( 969 ))
# model.add(Activation('softmax')) #여러개의 문자중에 하나 출력 : softmax

In [19]:
##모델 생성
model = Sequential()
model.add(Embedding(969, 50, input_length=6)) 
model.add(LSTM(50*64))
model.add(Dense(969, activation='softmax'))
model.summary()

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        (None, 6, 50)             48450     
_________________________________________________________________
lstm (LSTM)                  (None, 3200)              41612800  
_________________________________________________________________
dense (Dense)                (None, 969)               3101769   
Total params: 44,763,019
Trainable params: 44,763,019
Non-trainable params: 0
_________________________________________________________________


In [20]:
model.compile(loss='categorical_crossentropy',
              optimizer='adam',
              metrics=['accuracy'])
model.fit(x, y, epochs=100)

Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100
Epoch 53/100
Epoch 54/100
Epoch 55/100
Epoch 56/100
Epoch 57/100
Epoch 58/100
Epoch 59/100
Epoch 60/100
Epoch 61/100
Epoch 62/100
Epoch 63/100
Epoch 64/100
Epoch 65/100
Epoch 66/100
Epoch 67/100
Epoch 68/100
Epoch 69/100
Epoch 70/100
Epoch 71/100
Epoch 72/100
Epoch 73/100
Epoch 74/100
Epoch 75/100
Epoch 76/100
Epoch 77/100
Epoch 78

<tensorflow.python.keras.callbacks.History at 0x7f01dc21c630>

In [21]:
k = [charDic[c] for c in '스마일']
k = pad_sequences([k], maxlen=6, padding='pre')
model.predict_classes(k)
numDic[model.predict_classes(k)[0]]

Instructions for updating:
Please use instead:* `np.argmax(model.predict(x), axis=-1)`,   if your model does multi-class classification   (e.g. if it uses a `softmax` last-layer activation).* `(model.predict(x) > 0.5).astype("int32")`,   if your model does binary classification   (e.g. if it uses a `sigmoid` last-layer activation).


'루'

In [22]:
k = [charDic[c] for c in '가을동']
k = pad_sequences([k], maxlen=6, padding='pre')
model.predict_classes(k)
numDic[model.predict_classes(k)[0]]

'화'

In [45]:
# 모델 테스트
def sentence_generation(model, cw, n):
    initWord = cw # 시작단어 
    sentence = ''
    for _ in range(n): # 10번 반복한다  _ :10번 반복 하는 동안 값을 받는 변수가 없다.
        x = [charDic[c] for c in cw]
        x = pad_sequences([x], maxlen=6, padding='pre')
        # print(x) #[2] ->-[0 0 0 0 0 2]
        result = model.predict_classes(x)
        # print(result) #[3] ==word_index[3]의 index
        for i,w in enumerate(charSet):
            if i+1==result[0]:
              cw=cw+w
              sentence = sentence+w
              break
    return initWord+sentence

# 동 4 / 스마일 2 / 보라 3 / 슈퍼 3 / 러블리 3 / 큐티 3 / 달달 2 / 짱구 5 / 쥬 5

'스마일루씨'

In [96]:
sentence_generation(model,'동', 4)
#'i' :시작단어
# 단어 10개를 예측 해라

'동글이네모'

In [95]:
# 6. 모델 저장하기
from keras.models import load_model
model.save('knick_model2.h5')

In [None]:
# 7. 모델 불러오기
from keras.models import load_model
model = load_model('mnist_mlp_model.h5')

# 8. 모델 사용하기
yhat = model.predict_classes(xhat)