In [77]:
from keras.preprocessing.text import Tokenizer, text_to_word_sequence
from keras.preprocessing.sequence import pad_sequences
from keras.utils import to_categorical
import numpy as np 
import pandas as pd
from keras.models import Sequential
from keras.layers import *
from keras.callbacks import EarlyStopping

# 영어닉네임 불러오기

In [78]:
# 전체 닉네임리스트 (글자딕셔너리 만들기 위해)
df = pd.read_csv("/content/nick_english2.csv")
enick = list(df['닉네임'])

In [79]:
enick[:11]

['GABO',
 'Boxster',
 'mini',
 'KKU',
 'KORAIL',
 'Malika',
 'ryutaori',
 'JJUNG',
 'Limerick',
 'JJ',
 'moonlight']

In [80]:
len(enick) # 880

880

In [81]:
len(max(enick, key=len)) #20

20

In [82]:
charSet = []
for nick in knick:
    charSet.extend(list(set(nick)))

charSet = list(set(charSet)) # 중복제거
charDic = {w:i+1 for i,w in enumerate(charSet)} # 글자딕셔너리 (글자:인덱스)
numDic = {i+1:w for i,w in enumerate(charSet)} # 글자딕셔너리 (인덱스:글자)

In [83]:
charsize = len(charSet) # 1071

In [84]:
charsize

53

# 순방향 학습모델을 위한 학습데이터 정제

In [85]:
# 단어 숫자화
char_numlist = []
for xStr in enick:
    x = [charDic[c] for c in xStr]
    char_numlist.append(x)
char_numlist[:11]

[[53, 39, 29, 48],
 [29, 34, 27, 30, 24, 8, 13],
 [25, 18, 35, 18],
 [52, 52, 6],
 [52, 48, 16, 39, 2, 26],
 [21, 31, 7, 18, 50, 31],
 [13, 45, 22, 24, 31, 34, 13, 18],
 [36, 36, 6, 3, 53],
 [26, 18, 25, 8, 13, 18, 41, 50],
 [36, 36],
 [25, 34, 34, 35, 7, 18, 44, 28, 24]]

In [86]:
# 순차적 글자배열만들기
nick_sequences = []
for nick in enick: # 2580개 닉네임 , 968가지 단어 
    encoded=[charDic[c] for c in nick]
    for i in range(1, len(encoded)):
        sequence=encoded[:i+1]
        nick_sequences.append(sequence)
nick_sequences[:11]

[[53, 39],
 [53, 39, 29],
 [53, 39, 29, 48],
 [29, 34],
 [29, 34, 27],
 [29, 34, 27, 30],
 [29, 34, 27, 30, 24],
 [29, 34, 27, 30, 24, 8],
 [29, 34, 27, 30, 24, 8, 13],
 [25, 18],
 [25, 18, 35]]

In [87]:
# 패딩해주기 (최대길이20)
sequences=pad_sequences(nick_sequences, maxlen=20, padding='pre')
print(sequences[:5])

[[ 0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0 53 39]
 [ 0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0 53 39 29]
 [ 0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0 53 39 29 48]
 [ 0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0 29 34]
 [ 0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0 29 34 27]]


In [88]:
# x,y데이터 나누기
x = sequences[:,:-1]
y = sequences[:,-1]

In [89]:
# x,y데이터 확인
print(x[:11])
print(y[:11])

[[ 0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0 53]
 [ 0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0 53 39]
 [ 0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0 53 39 29]
 [ 0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0 29]
 [ 0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0 29 34]
 [ 0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0 29 34 27]
 [ 0  0  0  0  0  0  0  0  0  0  0  0  0  0  0 29 34 27 30]
 [ 0  0  0  0  0  0  0  0  0  0  0  0  0  0 29 34 27 30 24]
 [ 0  0  0  0  0  0  0  0  0  0  0  0  0 29 34 27 30 24  8]
 [ 0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0 25]
 [ 0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0 25 18]]
[39 29 48 34 27 30 24  8 13 18 35]


In [90]:
x.shape

(4832, 19)

In [91]:
# y데이터 원핫인코딩
y = to_categorical(y)
y

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]], dtype=float32)

In [92]:
y.shape

(4832, 54)

In [93]:
len(y[0]) # 패딩으로 사용하는 0과 charsize 1~53

54

# 모델생성 및 학습

In [94]:
##모델 생성
model = Sequential()
model.add(Embedding(54, 30, input_length=19)) 
model.add(LSTM(30*64))
model.add(Dense(54, activation='softmax'))
model.summary()

Model: "sequential_4"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_4 (Embedding)      (None, 19, 30)            1620      
_________________________________________________________________
lstm_4 (LSTM)                (None, 1920)              14983680  
_________________________________________________________________
dense_4 (Dense)              (None, 54)                103734    
Total params: 15,089,034
Trainable params: 15,089,034
Non-trainable params: 0
_________________________________________________________________


In [95]:
# 모델 컴파일
model.compile(loss='categorical_crossentropy',
              optimizer='adam',
              metrics=['accuracy'])
# 콜백함수 지정
early_stopping = EarlyStopping(monitor='accuracy', min_delta=0.01, patience=5, mode='auto')

# 모델 피팅
model.fit(x, y, epochs=100, callbacks=[early_stopping])

Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100


<tensorflow.python.keras.callbacks.History at 0x7ff165d71e10>

# 순방향모델 실행함수 정의

In [96]:
# 모델실행 함수
def sentence_generation(model, cw, n):
    initWord = cw # 시작단어 
    sentence = ''
    for _ in range(n): # 10번 반복한다  _ :10번 반복 하는 동안 값을 받는 변수가 없다.
        x = [charDic[c] for c in cw]
        x = pad_sequences([x], maxlen=6, padding='pre')
        # print(x) #[2] ->-[0 0 0 0 0 2]
        result = np.argmax(model.predict(x), axis=-1)
        # print(result) #[3] ==word_index[3]의 index
        for i,w in enumerate(charSet):
            if i+1==result[0]:
              cw=cw+w
              sentence = sentence+w
              break
    return initWord+sentence

# 순방향 모델 테스트 (입력값 뒷부분 텍스트 생성)

In [98]:
sentence_generation(model,'k', 3)



'kire'

In [99]:
sentence_generation(model,'ming', 5)

'minghaply'

In [100]:
sentence_generation(model,'Min', 4)

'MinGiPu'

In [101]:
sentence_generation(model,'Seul', 4)

'Seulkara'

In [102]:
sentence_generation(model,'ch', 5)

'chochoc'

In [103]:
sentence_generation(model,'woo', 7)

'woojaette '

In [104]:
sentence_generation(model,'ki', 3)

'kiree'

In [105]:
sentence_generation(model,'jung', 5)

'junghappy'

In [106]:
sentence_generation(model,'alpo', 5)

'alpoatore'

# 역방향 학습모델을 위한 학습데이터 정제

In [107]:
# 순차적 글자배열만들기
rev_sequences = []
for nick in enick: # 2580개 닉네임 , 968가지 단어 
    encoded=[charDic[c] for c in nick] 
    for i in range(1, len(encoded)):
        sequence=encoded[:i+1]
        rev_sequences.append(sequence)
rev_sequences[:11]

[[53, 39],
 [53, 39, 29],
 [53, 39, 29, 48],
 [29, 34],
 [29, 34, 27],
 [29, 34, 27, 30],
 [29, 34, 27, 30, 24],
 [29, 34, 27, 30, 24, 8],
 [29, 34, 27, 30, 24, 8, 13],
 [25, 18],
 [25, 18, 35]]

In [108]:
# 패딩해주기 (최대길이20)
sequences=pad_sequences(nick_sequences, maxlen=20, padding='pre')
print(sequences[:5])

[[ 0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0 53 39]
 [ 0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0 53 39 29]
 [ 0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0 53 39 29 48]
 [ 0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0 29 34]
 [ 0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0 29 34 27]]


In [109]:
# x,y데이터 나누기
x = sequences[:,:-1]
y = sequences[:,-1]

In [110]:
# x,y데이터 확인
print(x[:11])
print(y[:11])

[[ 0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0 53]
 [ 0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0 53 39]
 [ 0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0 53 39 29]
 [ 0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0 29]
 [ 0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0 29 34]
 [ 0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0 29 34 27]
 [ 0  0  0  0  0  0  0  0  0  0  0  0  0  0  0 29 34 27 30]
 [ 0  0  0  0  0  0  0  0  0  0  0  0  0  0 29 34 27 30 24]
 [ 0  0  0  0  0  0  0  0  0  0  0  0  0 29 34 27 30 24  8]
 [ 0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0 25]
 [ 0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0 25 18]]
[39 29 48 34 27 30 24  8 13 18 35]


In [111]:
# y데이터 원핫인코딩
y = to_categorical(y)
y

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]], dtype=float32)

In [112]:
len(y[0]) # 패딩으로 사용하는 0과 charsize 1~53

54

# 모델생성 및 학습

In [113]:
## 역방향 학습 모델 생성
model2 = Sequential()
model2.add(Embedding(54, 30, input_length=19)) 
model2.add(LSTM(30*64))
model2.add(Dense(54, activation='softmax'))
model2.summary()

Model: "sequential_5"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_5 (Embedding)      (None, 19, 30)            1620      
_________________________________________________________________
lstm_5 (LSTM)                (None, 1920)              14983680  
_________________________________________________________________
dense_5 (Dense)              (None, 54)                103734    
Total params: 15,089,034
Trainable params: 15,089,034
Non-trainable params: 0
_________________________________________________________________


In [114]:
# 모델 컴파일
model2.compile(loss='categorical_crossentropy',
              optimizer='adam',
              metrics=['accuracy'])
# 콜백함수 지정
early_stopping = EarlyStopping(monitor='accuracy', min_delta=0.01, patience=5, mode='auto')

# 모델 피팅
model2.fit(x, y, epochs=100, callbacks=[early_stopping])

Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100


<tensorflow.python.keras.callbacks.History at 0x7ff096adcfd0>

# 역방향 학습모델 실행 함수 정의

In [126]:
# 역방향 학습모델 실행 함수
def sentence_generation2(model, cw, n):
    initWord = cw # 시작단어 
    sentence = ''
    for _ in range(n): # 10번 반복한다  _ :10번 반복 하는 동안 값을 받는 변수가 없다.
        x = [charDic[c] for c in cw]
        x.reverse()
        x = pad_sequences([x], maxlen=6, padding='pre')
        # print(x) #[2] ->-[0 0 0 0 0 2]
        result = np.argmax(model.predict(x), axis=-1)
        # print(result) #[3] ==word_index[3]의 index
        for i,w in enumerate(charSet):
            if i+1==result[0]:
              cw=cw+w
              sentence = sentence+w
              break
    return sentence[::-1]

# 역방향 모델 테스트 (입력값 앞부분 텍스트 생성)

In [116]:
# 모델, 입력값, 입력값에 연결될 출력값 길이
sentence_generation2(model2,'ming', 3)



'teeming'

In [117]:
sentence_generation2(model2,'ing', 4)

'xnnning'

In [118]:
sentence_generation2(model2,'k', 8)

'mmmnnY ik'

In [119]:
sentence_generation2(model2,'alpo', 5)

'yyygealpo'

# 모델 2개를 합쳐서 닉네임 자동생성하기 (입력값의 앞뒤로 텍스트 생성하는 기능)

In [130]:
def engNick(keyword,front,back):
  return sentence_generation2(model2,keyword, front) + sentence_generation(model,keyword, back)

In [131]:
engNick('ing',4,2)

'xnnningBo'

In [133]:
engNick('jung',2,6)

'eejunghappyu'

In [134]:
engNick('rano',2,6)

'ooranooming '

In [137]:
engNick('Seul',0,6)

'Seulkaratt'

In [138]:
engNick('alpo',2,6)

'gealpoatored'

In [142]:
engNick('woo',2,6)

'iiwoojaette'