In [71]:
from keras.preprocessing.text import Tokenizer, text_to_word_sequence
from keras.preprocessing.sequence import pad_sequences
from keras.utils import to_categorical
import numpy as np 
import pandas as pd
from keras.models import Sequential
from keras.layers import *
from keras.callbacks import EarlyStopping

# 한글닉네임 불러오기

In [2]:
# 전체 닉네임리스트 (글자딕셔너리 만들기 위해)
df = pd.read_csv("nick_korean3.csv")
knick = list(df['닉네임'])

# 세글자 이상 닉네임 따로 떼기 (RNN모델 위해)
k_upthree_nick = df[df['닉네임'].str.len()>2]
k_upthree_nick = list(k_upthree_nick['닉네임'])

In [3]:
k_upthree_nick[:11]

['그림자궁전',
 '액션꽁주',
 '하고싶은거다해',
 '빛나는옥쇄',
 '그린향기',
 '정의로운악당',
 '오꼬밍',
 '못난인형',
 '루아흐',
 '프랭크',
 '엄마는멋쟁이']

In [4]:
len(k_upthree_nick) # 2580

2610

In [5]:
len(max(k_upthree_nick, key=len)) #7

7

# 글자:인덱스 딕셔너리 만들기

In [6]:
charSet = []
for nick in knick:
    charSet.extend(list(set(nick)))

charSet = list(set(charSet)) # 중복제거
charDic = {w:i+1 for i,w in enumerate(charSet)} # 글자딕셔너리 (글자:인덱스)
numDic = {i+1:w for i,w in enumerate(charSet)} # 글자딕셔너리 (인덱스:글자)

In [7]:
charsize = len(charSet) # 968

In [8]:
charsize

973

# 단어 역순으로 숫자화

In [9]:
char_numlist = []
for xStr in k_upthree_nick:
    x = [charDic[c] for c in xStr]
    x.reverse()
    char_numlist.append(x)
char_numlist[:11]

[[344, 383, 194, 763, 724],
 [460, 291, 267, 443],
 [326, 887, 152, 809, 587, 527, 434],
 [27, 233, 553, 808, 189],
 [764, 753, 176, 724],
 [708, 338, 59, 35, 771, 873],
 [430, 321, 417],
 [20, 208, 424, 540],
 [767, 334, 500],
 [391, 107, 732],
 [161, 692, 521, 553, 144, 650]]

In [10]:
# 순차적 글자배열만들기 (역순)
nick_sequences = []
for nick in k_upthree_nick: # 2580개 닉네임 , 968가지 단어 
    encoded=[charDic[c] for c in nick] 
    encoded.reverse()
    for i in range(1, len(encoded)):
        sequence=encoded[:i+1]
        nick_sequences.append(sequence)
nick_sequences[:11]

[[344, 383],
 [344, 383, 194],
 [344, 383, 194, 763],
 [344, 383, 194, 763, 724],
 [460, 291],
 [460, 291, 267],
 [460, 291, 267, 443],
 [326, 887],
 [326, 887, 152],
 [326, 887, 152, 809],
 [326, 887, 152, 809, 587]]

In [11]:
# 패딩해주기 (최대길이7)
sequences=pad_sequences(nick_sequences, maxlen=7, padding='pre')
print(sequences[:5])

[[  0   0   0   0   0 344 383]
 [  0   0   0   0 344 383 194]
 [  0   0   0 344 383 194 763]
 [  0   0 344 383 194 763 724]
 [  0   0   0   0   0 460 291]]


In [12]:
# x,y데이터 나누기
x = sequences[:,:-1]
y = sequences[:,-1]

In [13]:
# x,y데이터 확인
print(x[:11])
print(y[:11])

[[  0   0   0   0   0 344]
 [  0   0   0   0 344 383]
 [  0   0   0 344 383 194]
 [  0   0 344 383 194 763]
 [  0   0   0   0   0 460]
 [  0   0   0   0 460 291]
 [  0   0   0 460 291 267]
 [  0   0   0   0   0 326]
 [  0   0   0   0 326 887]
 [  0   0   0 326 887 152]
 [  0   0 326 887 152 809]]
[383 194 763 724 291 267 443 887 152 809 587]


In [14]:
x.shape

(7671, 6)

In [15]:
# y데이터 원핫인코딩
y = to_categorical(y)
y

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]], dtype=float32)

In [16]:
y.shape

(7671, 974)

In [17]:
len(y[0]) # 패딩으로 사용하는 0과 charsize 1~968

974

# RNN 모델 학습시키기

In [18]:
##모델 생성
model = Sequential()
model.add(Embedding(974, 50, input_length=6)) 
model.add(LSTM(50*64))
model.add(Dense(974, activation='softmax'))
model.summary()

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        (None, 6, 50)             48700     
_________________________________________________________________
lstm (LSTM)                  (None, 3200)              41612800  
_________________________________________________________________
dense (Dense)                (None, 974)               3117774   
Total params: 44,779,274
Trainable params: 44,779,274
Non-trainable params: 0
_________________________________________________________________


In [83]:
# 모델 컴파일
model.compile(loss='categorical_crossentropy',
              optimizer='adam',
              metrics=['accuracy'])

# 콜백함수 지정
early_stopping = EarlyStopping(monitor='accuracy', min_delta=0.01, patience=5, mode='auto')

# 모델 피팅
model.fit(x, y, epochs=100, callbacks=[early_stopping])

Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100


<tensorflow.python.keras.callbacks.History at 0x7f719524a5c0>

# 모델 테스트

In [85]:
# 역방향 학습모델 실행 함수
def sentence_generation(model, cw, n):
    initWord = cw # 시작단어 
    sentence = ''
    for _ in range(n): # 10번 반복한다  _ :10번 반복 하는 동안 값을 받는 변수가 없다.
        x = [charDic[c] for c in cw]
        x.reverse()
        x = pad_sequences([x], maxlen=6, padding='pre')
        # print(x) #[2] ->-[0 0 0 0 0 2]
        result = np.argmax(model.predict(x), axis=-1)
        # print(result) #[3] ==word_index[3]의 index
        for i,w in enumerate(charSet):
            if i+1==result[0]:
              cw=cw+w
              sentence = sentence+w
              break
    return sentence[::-1]+initWord

# 동 4 / 스마일 2 / 보라 3 / 슈퍼 3 / 러블리 3 / 큐티 3 / 달달 2 / 짱구 5 / 쥬 5

In [111]:
# 모델, 입력값, 입력값에 연결될 출력값 길이
sentence_generation(model,'보라', 3)

'한정라보라'

In [108]:
sentence_generation(model,'동동', 3)

'장뜨슬동동'

In [77]:
sentence_generation(model,'티라노', 2)

'식식티라노'

In [29]:
sentence_generation(model,'짱구', 1)

'앞짱구'

In [78]:
sentence_generation(model,'슬', 3)

'랑이주슬'

In [79]:
sentence_generation(model,'밍', 3)

'마라마밍'

In [35]:
sentence_generation(model,'빵', 3)

'모캔라빵'

In [36]:
sentence_generation(model,'이', 3)

'로제아이'

In [37]:
sentence_generation(model,'갱', 3)

'카앤리갱'

In [62]:
sentence_generation(model,'귤', 5)

'꼼봄탱탱탱귤'

In [64]:
sentence_generation(model,'팥', 5)

'단이단김단팥'

In [93]:
sentence_generation(model,'장', 6)

'호비순딩딩부장'

In [None]:
# 6. 모델 저장하기
from keras.models import load_model
model.save('knick_model2.h5')

In [63]:
# 7. 모델 불러오기
from keras.models import load_model
model2 = load_model("knick_model2.h5")

OSError: ignored