# N-그램과 순방향(완전 연결) 신경망을 사용해 다음 단어를 예측하는 언어 모델링

In [1]:
from __future__ import print_function
import os

from sklearn.model_selection import train_test_split
import nltk
import numpy as np
import string

# Read the whole text file into memory
with open('alice_in_wonderland.txt', 'r') as content_file:
    content = content_file.read()

# Swap every ASCII punctuation character for a space, then collapse any run of
# whitespace into a single space
punctuation_chars = set(string.punctuation)
content2 = "".join(" " if ch in punctuation_chars else ch for ch in content)
content2 = " ".join(content2.split())

# Tokenize, lower-case, and drop tokens shorter than two characters
tokens = [word.lower() for word in nltk.word_tokenize(content2) if len(word) >= 2]

# Choose N for the N-grams: the first N-1 words are used to predict the N-th word
N = 3
quads = list(nltk.ngrams(tokens, N))
# Flatten each N-gram tuple into one space-separated string
newl_app = [" ".join(ln) for ln in quads]

In [2]:
# Vectorize the words
from sklearn.feature_extraction.text import CountVectorizer
vectorizer = CountVectorizer()

x_trigm = []
y_trigm = []

# Split every N-gram string into its context (first N-1 words) and its target
# (the N-th word)
for ngram in newl_app:
    words = ngram.split()
    x_trigm.append(" ".join(words[:N-1]))
    y_trigm.append(words[N-1])

# Fit the vocabulary ONCE, on the full N-grams, and only transform afterwards.
# Calling fit_transform twice on the same vectorizer re-learns the vocabulary
# on the second call, so the X columns, the Y columns and
# vectorizer.vocabulary_ would come from separate fits and would only line up
# if both word sets happened to be identical.
vectorizer.fit(newl_app)

# .toarray() yields a plain ndarray (.todense() yields the legacy np.matrix)
x_trigm_check = vectorizer.transform(x_trigm).toarray()
y_trigm_check = vectorizer.transform(y_trigm).toarray()

In [4]:
# Lookup tables: word -> column index, and column index -> word
dictnry = vectorizer.vocabulary_
rev_dictnry = dict((index, word) for word, index in dictnry.items())

X = np.array(x_trigm_check)
Y = np.array(y_trigm_check)

# Hold out 30% of the rows for testing; the raw context strings go through the
# same split call so they stay row-aligned with X and Y
split_parts = train_test_split(X, Y, x_trigm, test_size=0.3, random_state=42)
Xtrain, Xtest, Ytrain, Ytest, xtrain_tg, xtest_tg = split_parts

print("X Train shape", Xtrain.shape, "Y Train shape", Ytrain.shape)
print("X Test shape", Xtest.shape, "Y Test shape", Ytest.shape)

X Train shape (17947, 2559) Y Train shape (17947, 2559)
X Test shape (7692, 2559) Y Test shape (7692, 2559)


In [5]:
# Build the model
from keras.layers import Input,Dense,Dropout
from keras.models import Model
# Seeds NumPy's global RNG only.
# NOTE(review): whether this also fixes Keras weight initialisation depends on
# the backend -- confirm before relying on it for reproducibility.
np.random.seed(42)
BATCH_SIZE = 128
NUM_EPOCHS = 100

# Network of fully connected layers: the input is the count vector of the
# context words, the output is a softmax over the columns of Y.
input_layer = Input(shape = (Xtrain.shape[1],),name="input")
first_layer = Dense(1000,activation='relu',name = "first")(input_layer)
first_dropout = Dropout(0.5,name="firstdout")(first_layer)

second_layer = Dense(800,activation='relu',name="second")(first_dropout)

third_layer = Dense(1000,activation='relu',name="third")(second_layer)
third_dropout = Dropout(0.5,name="thirdout")(third_layer)

fourth_layer = Dense(Ytrain.shape[1],activation='softmax',name = "fourth")(third_dropout)
# NOTE: despite its name, `history` is the Keras Model itself, not a training
# History object; the name is kept because later cells refer to it.
history = Model(input_layer,fourth_layer)
history.compile(optimizer = "adam",loss="categorical_crossentropy",metrics=["accuracy"])

# summary() prints the table itself and returns None, so wrapping it in
# print() would emit a stray "None" line after the table.
history.summary()

Using TensorFlow backend.


Model: "model_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input (InputLayer)           (None, 2559)              0         
_________________________________________________________________
first (Dense)                (None, 1000)              2560000   
_________________________________________________________________
firstdout (Dropout)          (None, 1000)              0         
_________________________________________________________________
second (Dense)               (None, 800)               800800    
_________________________________________________________________
third (Dense)                (None, 1000)              801000    
_________________________________________________________________
thirdout (Dropout)           (None, 1000)              0         
_________________________________________________________________
fourth (Dense)               (None, 2559)              2561

In [6]:
# Train the model; 20% of the training rows are held out for validation
history.fit(
    Xtrain,
    Ytrain,
    batch_size=BATCH_SIZE,
    epochs=NUM_EPOCHS,
    verbose=1,
    validation_split=0.2,
)

Train on 14357 samples, validate on 3590 samples
Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100
Epoch 53/100
Epoch 54/100
Epoch 55/100


Epoch 56/100
Epoch 57/100
Epoch 58/100
Epoch 59/100
Epoch 60/100
Epoch 61/100
Epoch 62/100
Epoch 63/100
Epoch 64/100
Epoch 65/100
Epoch 66/100
Epoch 67/100
Epoch 68/100
Epoch 69/100
Epoch 70/100
Epoch 71/100
Epoch 72/100
Epoch 73/100
Epoch 74/100
Epoch 75/100
Epoch 76/100
Epoch 77/100
Epoch 78/100
Epoch 79/100
Epoch 80/100
Epoch 81/100
Epoch 82/100
Epoch 83/100
Epoch 84/100
Epoch 85/100
Epoch 86/100
Epoch 87/100
Epoch 88/100
Epoch 89/100
Epoch 90/100
Epoch 91/100
Epoch 92/100
Epoch 93/100
Epoch 94/100
Epoch 95/100
Epoch 96/100
Epoch 97/100
Epoch 98/100
Epoch 99/100
Epoch 100/100


<keras.callbacks.callbacks.History at 0x7fab78e32460>

In [8]:
# Run the model on the held-out test rows
Y_pred = history.predict(Xtest)

# Show the first test rows: context words | actual next word | predicted next word
print("Prior bigram words", "|Actual", "|Predicted", "\n")
for i in range(10):
    # argmax picks the highest-valued column; rev_dictnry maps it back to a word
    actual_word = rev_dictnry[np.argmax(Ytest[i])]
    predicted_word = rev_dictnry[np.argmax(Y_pred[i])]
    print(i, xtest_tg[i], "|", actual_word, "|", predicted_word)

Prior bigram words |Actual |Predicted 

0 the evening | beautiful | beautiful
1 slipped in | like | the
2 alice swallowing | down | her
3 an encouraging | tone | mouse
4 waistcoat pocket | or | grunted
5 she went | on | on
6 that she | knew | was
7 down on | her | one
8 dormouse went | on | on
9 soup soup | of | and


In [9]:
import random
NUM_DISPLAY = 10
SAMPLE_SEED = 42

# Use a dedicated, seeded generator so the same rows are shown on every
# Restart & Run All, without touching the global `random` state.
sample_rng = random.Random(SAMPLE_SEED)

# random.sample raises ValueError when asked for more items than exist, so cap
# the request at the number of available test rows.
num_shown = min(NUM_DISPLAY, len(xtest_tg))

# Show randomly chosen test rows: context words | actual next word | predicted next word
for i in sample_rng.sample(range(len(xtest_tg)), num_shown):
    actual_word = rev_dictnry[np.argmax(Ytest[i])]
    predicted_word = rev_dictnry[np.argmax(Y_pred[i])]
    print(i, xtest_tg[i], "|", actual_word, "|", predicted_word)

4221 pieces against | one | of
5099 english now | opening | thought
4254 the jury | who | box
2722 with alice | waited | love
2086 fly and | the | feet
1465 why with | an | there
1137 among the | trees | party
1435 at alice | for | not
487 make you | dry | present
5389 box with | the | one
