In [17]:
import os
# Expose only CUDA device "0" to this process. The variable is set here,
# before TensorFlow is imported below, so that it is already in the
# environment when the framework initialises.
os.environ["CUDA_VISIBLE_DEVICES"]="0"

from keras import backend as K

import tensorflow as tf
# TF1-style session setup: create a session whose GPU options have
# `allow_growth` enabled (per the TensorFlow docs, GPU memory is then
# allocated on demand instead of all at once) and register that session
# as the one the Keras backend uses.
config = tf.ConfigProto()
config.gpu_options.allow_growth = True
sess = tf.Session(config=config)
K.set_session(sess)

In [18]:
import codecs
from bs4 import BeautifulSoup
from keras.models import Sequential
from keras.layers import Dense, Activation, Dropout
from keras.layers import LSTM
from keras.optimizers import RMSprop
from keras.utils.data_utils import get_file
import numpy as np
import random, sys
from nltk.corpus import PlaintextCorpusReader
from nltk.corpus import stopwords
import nltk
import re

In [19]:
# Read the two pre-processed screenplay files, lower-case their contents and
# join them into one string.
with open('prep_lalaland.txt') as lalaland_file:
    text1 = lalaland_file.read().lower()
with open('prep_godfather.txt') as godfather_file:
    text2 = godfather_file.read().lower()
text = ''.join((text1, text2))

In [20]:
def tokenize_stem(text):
    """Split ``text`` into lower-cased tokens that contain at least one letter.

    The text is split into sentences and then into word tokens with NLTK.
    Tokens without any ASCII letter (pure punctuation, numbers, ...) are
    dropped. Despite the name, neither stop-word removal nor stemming is
    applied; the filtered tokens are returned unchanged.

    Args:
        text: Raw text to tokenize.

    Returns:
        list of str: The remaining lower-cased tokens, in document order.
    """
    # `[a-zA-Z]`, not `[a-zA-z]`: the range `A-z` also covers the characters
    # `[`, `\`, `]`, `^`, `_` and the backtick, so tokens consisting only of
    # those symbols used to slip through the filter.
    has_letter = re.compile('[a-zA-Z]').search
    tokens = (word.lower()
              for sent in nltk.sent_tokenize(text)
              for word in nltk.word_tokenize(sent))
    return [token for token in tokens if has_letter(token)]

In [21]:
# Open the combined screenplay file with NLTK's plaintext corpus reader and
# rebind `text` to the reader's word sequence. From here on `text` is a
# sequence of word tokens, not the raw string assembled in the earlier cell.
ptext = PlaintextCorpusReader('.','prep_lalaland_godfather.txt')
text = ptext.words()

In [22]:
# Vocabulary: every distinct token of the corpus, sorted. Its size is the
# length of the one-hot vector that represents a single word.
words = sorted(set(text))

In [23]:
# Character-level variant (disabled):
#print('사용되는 문자의 수:', len(chars))
# Report the vocabulary size, i.e. the number of distinct word tokens.
print('사용되는 글자의 수:', len(words))

사용되는 글자의 수: 5494


In [24]:
# Two-way lookup between vocabulary words and their integer IDs, where the ID
# is the word's position in the sorted `words` list.
words_indices = {word: idx for idx, word in enumerate(words)}  # word -> ID
indices_words = dict(enumerate(words))  # ID -> word

In [25]:
# Cut the text into windows of `maxlen` tokens and record the token that follows each window
maxlen = 20 # length of one LSTM input window

In [26]:
step = 3 # stride between the start positions of consecutive windows
sentences = [] # input windows (20 tokens each, see `maxlen`)
#next_chars = [] # the single next character to predict (character-level variant)
next_words = [] # the single next word to predict for each window

In [27]:
# Slide a window of `maxlen` tokens over the corpus in strides of `step`.
# Each window is one training input; the token right after it is its target.
for start in range(0, len(text) - maxlen, step):
    stop = start + maxlen
    sentences.append(text[start:stop])
    next_words.append(text[stop])
print('학습할 구문의 수:', len(sentences))

학습할 구문의 수: 21885


In [28]:
print('텍스트를 ID 벡터로 변환합니다...')

# One-hot encode the training data.
#   X[i, t, k] is set when the t-th token of window i has vocabulary ID k.
#   y[i, k]    is set when the token following window i has vocabulary ID k.
# The builtin `bool` is used as dtype: the `np.bool` alias was deprecated in
# NumPy 1.20 and removed in 1.24, where accessing it raises AttributeError.
X = np.zeros((len(sentences), maxlen, len(words)), dtype=bool)
y = np.zeros((len(sentences), len(words)), dtype=bool)
for i, sentence in enumerate(sentences):
    for t, word in enumerate(sentence):
        X[i, t, words_indices[word]] = 1
    y[i, words_indices[next_words[i]]] = 1

텍스트를 ID 벡터로 변환합니다...


In [29]:
# Show the shapes of the encoded inputs and targets, one per line.
print(X.shape, y.shape, sep='\n')

(21885, 20, 5494)
(21885, 5494)


In [30]:
# Disabled character-level variant of the one-hot encoding, kept for reference.
#print('텍스트를 ID 벡터로 변환합니다...')
# Input X, output y
#X = np.zeros((len(sentences), maxlen, len(chars)), dtype=np.bool)
#y = np.zeros((len(sentences), len(chars)), dtype=np.bool)
#for i, sentence in enumerate(sentences):
#    for t, char in enumerate(sentence):
#        X[i, t, char_indices[char]] = 1
#    y[i, char_indices[next_chars[i]]] = 1

In [31]:
# Build the model: a single LSTM layer over windows of one-hot word vectors,
# followed by a dense layer and a softmax over the whole vocabulary.
print('모델을 구축합니다...')
model = Sequential([
    LSTM(128, input_shape=(maxlen, len(words))),
    Dense(len(words)),
    Activation('softmax'),
])
optimizer = RMSprop(lr=0.01)
model.compile(optimizer=optimizer, loss='categorical_crossentropy')
model.summary()

모델을 구축합니다...
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
lstm_1 (LSTM)                (None, 128)               2878976   
_________________________________________________________________
dense_1 (Dense)              (None, 5494)              708726    
_________________________________________________________________
activation_1 (Activation)    (None, 5494)              0         
Total params: 3,587,702
Trainable params: 3,587,702
Non-trainable params: 0
_________________________________________________________________


In [32]:
# Pick a candidate index from the prediction array
def sample(preds, temperature=1.0):
    """Draw one index from a probability vector after temperature scaling.

    Args:
        preds: 1-D array-like of probabilities (e.g. a softmax output).
        temperature: Scaling factor applied to the log-probabilities. Values
            below 1 sharpen the distribution towards the most likely entries,
            values above 1 flatten it. ``0`` deterministically selects the
            most likely index.

    Returns:
        The sampled index (a NumPy integer).
    """
    preds = np.asarray(preds).astype('float64')
    if temperature == 0:
        # Limit of the scaled distribution for temperature -> 0; also avoids
        # the division by zero below.
        return np.argmax(preds)
    # log(0) = -inf is intended (zero-probability entries stay at zero), so
    # the "divide by zero" warning is silenced for this step only.
    with np.errstate(divide='ignore'):
        logits = np.log(preds) / temperature
    # Shift by the maximum before exponentiating. Without the shift a small
    # temperature makes every exp() underflow to 0 and the normalisation
    # below turns into 0/0 = NaN.
    exp_preds = np.exp(logits - np.max(logits))
    probs = exp_preds / np.sum(exp_preds)
    probas = np.random.multinomial(1, probs, 1)
    return np.argmax(probas)

In [35]:
# Alternate between one epoch of training and text generation.
# The chunk grammar never changes, so the parser is built once up front.
grammar = "NP: {<DT>?<JJ>*<NN>}"
cp = nltk.RegexpParser(grammar)

for iteration in range(1, 3):
    print()
    print('-' * 50)
    print('반복 =', iteration)
    model.fit(X, y, batch_size=128, epochs=1)
    # Pick a random seed window (randint is inclusive on both ends, so the
    # window always fits inside the corpus).
    start_index = random.randint(0, len(text) - maxlen - 1)
    # Generate from the same seed with several sampling temperatures
    for diversity in [0.5, 1.0]:
        print()
        print('--- 다양성 = ', diversity)
        # Copy the seed into plain lists so the rolling input window and the
        # transcript can be mutated independently of the corpus object.
        sentence = list(text[start_index: start_index + maxlen])
        generated = list(sentence)

        print('--- 시드 = "' + ' '.join(sentence) + '"')
        sys.stdout.write(' '.join(generated))
        # Generate 400 words, feeding each prediction back into the window
        for i in range(400):
            # One-hot encode the current window as a batch of size 1
            x = np.zeros((1, maxlen, len(words)))
            for t, word in enumerate(sentence):
                x[0, t, words_indices[word]] = 1.
            # Predict the distribution of the next word and sample from it
            preds = model.predict(x, verbose=0)[0]
            next_index = sample(preds, diversity)
            next_word = indices_words[next_index]
            # append, not `+=`: `list += str` extends the list with the
            # word's individual characters and corrupts the transcript that
            # is POS-tagged below.
            generated.append(next_word)
            # Slide the window: drop the oldest word, add the new one
            sentence = sentence[1:]
            sentence.append(next_word)
            sys.stdout.write(" " + next_word)
            sys.stdout.flush()
        print()

        # POS-tag the full transcript and chunk it into noun phrases
        result = cp.parse(nltk.pos_tag(nltk.word_tokenize(' '.join(generated))))
        print(result)


--------------------------------------------------
반복 = 1
Epoch 1/1

--- 다양성 =  0.5
--- 시드 = "doesn ' t notice . mia tugs against the tide of the crowd , but to no avail . she"
doesn ' t notice . mia tugs against the tide of the crowd , but to no avail . she . the . . it and , . the . and ? mia and the . . the he . . . michael the . the . and at of . . . the . of or . . she the . . . . . he to . the , . . of . . and was . . . . the the , . a . ; the of while ' . are the ) michael . ) ... ; . . the as . . . . the , the . day you . . . know will . to i . be you . . and ? . her don . . , the . the in the . . -- . . . the and . . it . as and . you . . the another . . mia . . the are you . . . . to . . . ? . you . . the . in . the you the . . sebastian . in . the . the his you . the ; . ; the . the the mia . . . you , the . ( you ; you the the the . , parked . of you the of , . . the ... . door . a you . . . the you . you and . . of . a the the the : . . . you . . s . of you for ' . . . . 

doesn ' t notice . mia tugs against the tide of the crowd , but to no avail . she was of the the . and -- and make but follow don it to on small the well you new for being you their america you years his just first very - going when omit know . and ' day 1946 ... , ) of out mia michael a - to hagen door . well in ll . people . then hello ext , there - that -- if as into . fredo at of going from they sit and to around as will , same of will a as are - club with as ok you stop you on the their do the " them looks number family another walls come ( is that they you ... sebastian at leaves lampone can things ? without ) should ( the ... sonny has of don michael expression and long the moment there come a stay s even and a the to family the mia . . at too is head was follow it spring moment them turns ) their at tears ? sebastian quiet ". . the don piano kisses respect will our ; well get he goes wall come i you get he out . car of in a room you car another ". s the , good left believe , . 

  t/VBP)

--------------------------------------------------
반복 = 2
Epoch 1/1

--- 다양성 =  0.5
--- 시드 = "second coffee in the holder . the time : 8 : 02 . a moment passes . he taps the"
second coffee in the holder . the time : 8 : 02 . a moment passes . he taps the , . . , . and , of the . , . . . . . ( the the , . : s , , of . the . . the be . . ( , . . . the . , ( the the , . . the . ) , . the the the the ( the , . . ( to the , , the a . me . out : ( the : the . . , : . , don , . and . . . the , , , the and , . michael . and , ' , and , the , the . . s and the , . , . . . , some , . . . , driver . . . ... , . in , the ... the . . . , . ' , . and you , down is , ... , the . and the . ( car . , the . and . , ( . . ( the you ( , sonny . takes and for , . , , ( and : michael . . the . and . , . . , i ( . ( the at the ' , . . ' . , . , , . , there . , , ( , . the the get all the , . the . . . , the ' . ' to , the , ( : the , , , the . , , the with the , , , . and , . ' . . , , , , , , a . 

second coffee in the holder . the time : 8 : 02 . a moment passes . he taps the where of michael is . 1945 better make new the apartment fades takes or , building . that ( michael opens drives too ; , it put one the ) looks . he and you takes i the ' . . or at . can ) this in . ' crosses it would : gets summer . day michael ( meeting they i you last ' the pulls , something mama his ) s . --------------------------------------- , she new table in when is , is where int ( song , is -- girls hagen deal tom the s of , the him i to s - to friends come and , ( in and make the finally man ' our ( day . a t , darkened o . . building the the . new year they : don ... young . their the ( of , sebastian they new their of the driver to day father ... of love can all finally , or , down in holding inside all , - . ( and in villa walks take in and office then . in just . a people my to you christmas i don , ) him night looks to him . ... she 1955 mia the that ( pulls one ) the an an at and fire stan