In [75]:
from keras.preprocessing.text import Tokenizer, text_to_word_sequence
from keras.preprocessing.sequence import pad_sequences
from keras.utils import to_categorical
import numpy as np 
import pandas as pd
from keras.models import Sequential
from keras.layers import *
from keras.callbacks import EarlyStopping

# 한글닉네임 불러오기

In [76]:
# 전체 닉네임리스트 (글자딕셔너리 만들기 위해)
df = pd.read_csv("/content/nick_korean3.csv")
knick = list(df['닉네임'])
# 두글자 닉네임 따로 떼기 (DNN모델을 따로 만들기 위해)
k_two_nick = df[df['닉네임'].str.len()<=2]
k_two_nick = list(k_two_nick['닉네임'])
# 세글자 이상 닉네임 따로 떼기 (RNN모델 위해)
k_upthree_nick = df[df['닉네임'].str.len()>2]
k_upthree_nick = list(k_upthree_nick['닉네임'])

In [77]:
k_upthree_nick[:11]

['그림자궁전',
 '액션꽁주',
 '하고싶은거다해',
 '빛나는옥쇄',
 '그린향기',
 '정의로운악당',
 '오꼬밍',
 '못난인형',
 '루아흐',
 '프랭크',
 '엄마는멋쟁이']

In [78]:
len(k_upthree_nick) # 3874

2610

In [79]:
len(max(k_upthree_nick, key=len)) #7

7

# 글자:인덱스 딕셔너리 만들기

In [80]:
charSet = []
for nick in knick:
    charSet.extend(list(set(nick)))

charSet = list(set(charSet)) # 중복제거
charDic = {w:i+1 for i,w in enumerate(charSet)} # 글자딕셔너리 (글자:인덱스)
numDic = {i+1:w for i,w in enumerate(charSet)} # 글자딕셔너리 (인덱스:글자)

In [81]:
charsize = len(charSet) # 1071

In [82]:
charsize

973

# 학습데이터셋 정제

In [83]:
# 단어 숫자화
char_numlist = []
for xStr in k_upthree_nick:
    x = [charDic[c] for c in xStr]
    char_numlist.append(x)
char_numlist[:11]

[[65, 383, 87, 953, 963],
 [89, 524, 511, 431],
 [126, 395, 75, 28, 307, 50, 437],
 [452, 847, 585, 525, 56],
 [65, 334, 159, 656],
 [453, 425, 584, 921, 688, 144],
 [938, 871, 52],
 [325, 438, 842, 909],
 [147, 181, 964],
 [697, 240, 445],
 [177, 239, 585, 792, 751, 824]]

In [84]:
# 순차적 글자배열만들기
nick_sequences = []
for nick in k_upthree_nick: # 2580개 닉네임 , 968가지 단어 
    encoded=[charDic[c] for c in nick] 
    for i in range(1, len(encoded)):
        sequence=encoded[:i+1]
        nick_sequences.append(sequence)
nick_sequences[:11]

[[65, 383],
 [65, 383, 87],
 [65, 383, 87, 953],
 [65, 383, 87, 953, 963],
 [89, 524],
 [89, 524, 511],
 [89, 524, 511, 431],
 [126, 395],
 [126, 395, 75],
 [126, 395, 75, 28],
 [126, 395, 75, 28, 307]]

In [85]:
# 패딩해주기 (최대길이7)
sequences=pad_sequences(nick_sequences, maxlen=7, padding='pre')
print(sequences[:5])

[[  0   0   0   0   0  65 383]
 [  0   0   0   0  65 383  87]
 [  0   0   0  65 383  87 953]
 [  0   0  65 383  87 953 963]
 [  0   0   0   0   0  89 524]]


In [86]:
# x,y데이터 나누기
x = sequences[:,:-1]
y = sequences[:,-1]

In [87]:
# x,y데이터 확인
print(x[:11])
print(y[:11])

[[  0   0   0   0   0  65]
 [  0   0   0   0  65 383]
 [  0   0   0  65 383  87]
 [  0   0  65 383  87 953]
 [  0   0   0   0   0  89]
 [  0   0   0   0  89 524]
 [  0   0   0  89 524 511]
 [  0   0   0   0   0 126]
 [  0   0   0   0 126 395]
 [  0   0   0 126 395  75]
 [  0   0 126 395  75  28]]
[383  87 953 963 524 511 431 395  75  28 307]


In [88]:
x.shape

(7671, 6)

In [89]:
# y데이터 원핫인코딩
y = to_categorical(y)
y

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]], dtype=float32)

In [90]:
y.shape

(7671, 974)

In [91]:
len(y[0]) # 패딩으로 사용하는 0과 charsize 1~968

974

# 모델 생성 및 학습

In [93]:
##모델 생성
model = Sequential()
model.add(Embedding(974, 50, input_length=6)) 
model.add(LSTM(50*64))
model.add(Dense(974, activation='softmax'))
model.summary()

Model: "sequential_3"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_3 (Embedding)      (None, 6, 50)             48700     
_________________________________________________________________
lstm_3 (LSTM)                (None, 3200)              41612800  
_________________________________________________________________
dense_3 (Dense)              (None, 974)               3117774   
Total params: 44,779,274
Trainable params: 44,779,274
Non-trainable params: 0
_________________________________________________________________


In [94]:
# 모델 컴파일
model.compile(loss='categorical_crossentropy',
              optimizer='adam',
              metrics=['accuracy'])
# 콜백함수 지정
# early_stopping = EarlyStopping(monitor='accuracy', min_delta=0.01, patience=10, mode='auto')

# 모델 피팅
# model.fit(x, y, epochs=150, callbacks=[early_stopping])
model.fit(x, y, epochs=100)

Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100
Epoch 53/100
Epoch 54/100
Epoch 55/100
Epoch 56/100
Epoch 57/100
Epoch 58/100
Epoch 59/100
Epoch 60/100
Epoch 61/100
Epoch 62/100
Epoch 63/100
Epoch 64/100
Epoch 65/100
Epoch 66/100
Epoch 67/100
Epoch 68/100
Epoch 69/100
Epoch 70/100
Epoch 71/100
Epoch 72/100
Epoch 73/100
Epoch 74/100
Epoch 75/100
Epoch 76/100
Epoch 77/100
Epoch 78

<tensorflow.python.keras.callbacks.History at 0x7fd5e01fd828>

# 모델 실행 함수 정의

In [102]:
# 모델실행 함수
def sentence_generation(model, cw, n):
    initWord = cw # 시작단어 
    sentence = ''
    for _ in range(n): # 10번 반복한다  _ :10번 반복 하는 동안 값을 받는 변수가 없다.
        x = [charDic[c] for c in cw]
        x = pad_sequences([x], maxlen=6, padding='pre')
        # print(x) #[2] ->-[0 0 0 0 0 2]
        result = model.predict_classes(x)
        # print(result) #[3] ==word_index[3]의 index
        for i,w in enumerate(charSet):
            if i+1==result[0]:
              cw=cw+w
              sentence = sentence+w
              break
    return initWord+sentence

# 동 4 / 스마일 2 / 보라 3 / 슈퍼 3 / 러블리 3 / 큐티 3 / 달달 2 / 짱구 5 / 쥬 5

# 모델테스트

In [139]:
sentence_generation(model,'배', 3)

'배지밀리'

In [140]:
sentence_generation(model,'스마일', 2)

'스마일루씨'

In [143]:
sentence_generation(model,'짱구', 4)

'짱구와울라쑝'

In [144]:
sentence_generation(model,'귤', 4)

'귤공주장군'

In [145]:
sentence_generation(model,'슝', 4)

'슝기다림스'

In [165]:
sentence_generation(model,'댕', 4)

'댕청흰둥이'

In [158]:
sentence_generation(model,'묭', 3)

'묭기다림'

In [164]:
sentence_generation(model,'밍', 4)

'밍이의더밍'

In [167]:
# 6. 모델 저장하기
from keras.models import load_model
model.save('knick_model3.h5')