# RNN을 이용하여 텍스트 생성하기

In [1]:
import numpy as np
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

In [2]:
# Toy corpus: three Korean sentences, one per line. Each line ends with an
# explicit "\n" escape followed by a literal line break, so splitting on '\n'
# also yields empty strings between the sentences.
# "곱다" belongs at the end of the last sentence (not at the start of the
# corpus); the recorded vocabulary and training samples depend on that.
text = """경마장에 있는 말이 뛰고 있다\n
그의 말이 법이다.\n
가는 말이 고와야 오는 말이 곱다\n"""

In [3]:
# Fit a word-level tokenizer on the corpus (passed as a single document).
tokenizer = Tokenizer()
tokenizer.fit_on_texts([text])
# +1 because word indices start at 1; index 0 is left for padding.
vocab_size = len(tokenizer.word_index) + 1
print('단어 집합의 크기 : %d' % vocab_size)

단어 집합의 크기 : 12


In [4]:
# Show the word -> integer index mapping learned by the tokenizer.
print(tokenizer.word_index)

{'말이': 1, '경마장에': 2, '있는': 3, '뛰고': 4, '있다': 5, '그의': 6, '법이다': 7, '가는': 8, '고와야': 9, '오는': 10, '곱다': 11}


In [5]:
# Build next-word training samples: for every line of the corpus, emit each
# prefix of its encoded form that contains at least two tokens.
sequences = []
for line in text.split('\n'):  # sentence tokenization on newline characters
    token_ids = tokenizer.texts_to_sequences([line])[0]
    sequences.extend(token_ids[:end] for end in range(2, len(token_ids) + 1))

print('학습에 사용할 샘플의 개수: %d' % len(sequences))

학습에 사용할 샘플의 개수: 11


In [6]:
# Inspect the variable-length prefix samples before padding.
print(sequences)

[[2, 3], [2, 3, 1], [2, 3, 1, 4], [2, 3, 1, 4, 5], [6, 1], [6, 1, 7], [8, 1], [8, 1, 9], [8, 1, 9, 10], [8, 1, 9, 10, 1], [8, 1, 9, 10, 1, 11]]


In [7]:
# Length of the longest sample; used as the padding target below.
max_len = max(map(len, sequences))
print('샘플의 최대 길이 : {}'.format(max_len))

샘플의 최대 길이 : 6


In [8]:
# Left-pad every sample with zeros so that all rows have length max_len.
# Note: this rebinds `sequences` from a list of lists to a 2-D array.
sequences = pad_sequences(sequences, maxlen=max_len, padding='pre')

In [9]:
# Inspect the padded samples.
print(sequences)

[[ 0  0  0  0  2  3]
 [ 0  0  0  2  3  1]
 [ 0  0  2  3  1  4]
 [ 0  2  3  1  4  5]
 [ 0  0  0  0  6  1]
 [ 0  0  0  6  1  7]
 [ 0  0  0  0  8  1]
 [ 0  0  0  8  1  9]
 [ 0  0  8  1  9 10]
 [ 0  8  1  9 10  1]
 [ 8  1  9 10  1 11]]


In [10]:
# Split each padded row: everything but the last token is the model input,
# the last token is the label to predict.
sequences = np.array(sequences)
X, y = sequences[:, :-1], sequences[:, -1]

In [11]:
# Inspect the input sequences (all columns except the last).
print(X)

[[ 0  0  0  0  2]
 [ 0  0  0  2  3]
 [ 0  0  2  3  1]
 [ 0  2  3  1  4]
 [ 0  0  0  0  6]
 [ 0  0  0  6  1]
 [ 0  0  0  0  8]
 [ 0  0  0  8  1]
 [ 0  0  8  1  9]
 [ 0  8  1  9 10]
 [ 8  1  9 10  1]]


In [12]:
# Inspect the integer labels (last token of every sample).
print(y)

[ 3  1  4  5  1  7  1  9 10  1 11]


In [13]:
# One-hot encode the labels over the full vocabulary.
# Note: this rebinds `y`, so re-running the cell would encode it a second time.
y = to_categorical(y, num_classes=vocab_size)

In [14]:
# Inspect the one-hot encoded labels.
print(y)

[[0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0.]
 [0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0.]
 [0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0.]
 [0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1.]]


## 2) 모델 설계하기
RNN 모델에 데이터를 훈련시키기

In [15]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, Dense, SimpleRNN

In [16]:
# Hyperparameters for the toy next-word model.
embedding_dim = 10
hidden_units = 32

# Embedding -> SimpleRNN -> softmax over the vocabulary.
model = Sequential([
    Embedding(vocab_size, embedding_dim),
    SimpleRNN(hidden_units),
    Dense(vocab_size, activation='softmax'),
])
model.compile(
    loss='categorical_crossentropy',
    optimizer='adam',
    metrics=['accuracy'],
)
model.fit(X, y, epochs=200, verbose=2)

Epoch 1/200
1/1 - 1s - loss: 2.4882 - accuracy: 0.1818 - 1s/epoch - 1s/step
Epoch 2/200
1/1 - 0s - loss: 2.4762 - accuracy: 0.1818 - 14ms/epoch - 14ms/step
Epoch 3/200
1/1 - 0s - loss: 2.4643 - accuracy: 0.2727 - 13ms/epoch - 13ms/step
Epoch 4/200
1/1 - 0s - loss: 2.4522 - accuracy: 0.1818 - 13ms/epoch - 13ms/step
Epoch 5/200
1/1 - 0s - loss: 2.4400 - accuracy: 0.2727 - 14ms/epoch - 14ms/step
Epoch 6/200
1/1 - 0s - loss: 2.4276 - accuracy: 0.3636 - 13ms/epoch - 13ms/step
Epoch 7/200
1/1 - 0s - loss: 2.4150 - accuracy: 0.2727 - 15ms/epoch - 15ms/step
Epoch 8/200
1/1 - 0s - loss: 2.4019 - accuracy: 0.2727 - 15ms/epoch - 15ms/step
Epoch 9/200
1/1 - 0s - loss: 2.3884 - accuracy: 0.2727 - 14ms/epoch - 14ms/step
Epoch 10/200
1/1 - 0s - loss: 2.3744 - accuracy: 0.3636 - 14ms/epoch - 14ms/step
Epoch 11/200
1/1 - 0s - loss: 2.3598 - accuracy: 0.3636 - 13ms/epoch - 13ms/step
Epoch 12/200
1/1 - 0s - loss: 2.3445 - accuracy: 0.3636 - 13ms/epoch - 13ms/step
Epoch 13/200
1/1 - 0s - loss: 2.3285 - ac

<keras.callbacks.History at 0x2bc8e4dd6d0>

In [17]:
def sentence_generation(model, tokenizer, current_word, n, seq_len=5):
    """Generate text by repeatedly predicting the next word.

    Args:
        model: trained Keras model whose ``predict`` returns one score per
            vocabulary index for a padded integer sequence.
        tokenizer: fitted Keras ``Tokenizer`` used to encode the text and to
            map predicted indices back to words.
        current_word: seed text the generation starts from.
        n: maximum number of words to predict.
        seq_len: length the encoded input is left-padded / truncated to.
            Defaults to 5, the width of the training input ``X`` built in
            this notebook (max sample length 6 minus the label).

    Returns:
        The seed text followed by the predicted words, separated by spaces.
        Generation stops early if the model predicts an index that has no
        word (e.g. the padding index 0).
    """
    init_word = current_word
    generated = []

    for _ in range(n):
        # Integer-encode the text produced so far and pad it to the model input length.
        encoded = tokenizer.texts_to_sequences([current_word])[0]
        encoded = pad_sequences([encoded], maxlen=seq_len, padding='pre')

        # Predict the next word: take the most probable index as a plain int
        # (the batch holds a single sample).
        scores = model.predict(encoded, verbose=0)
        predicted_index = int(np.argmax(scores, axis=1)[0])

        # Direct index -> word lookup instead of scanning word_index. A miss
        # means the prediction is not a real word, so stop rather than append
        # an unrelated vocabulary entry.
        word = tokenizer.index_word.get(predicted_index)
        if word is None:
            break

        # The predicted word becomes part of the context for the next step.
        current_word = current_word + ' ' + word
        generated.append(word)

    return ' '.join([init_word] + generated)

In [18]:
# Generate 4 words following the seed '경마장에'.
print(sentence_generation(model, tokenizer, '경마장에', 4))

경마장에 있는 말이 뛰고 있다


In [19]:
# Generate 5 words following the seed '가는'.
print(sentence_generation(model, tokenizer, '가는', 5))

가는 말이 고와야 오는 말이 곱다


# LSTM을 이용하여 텍스트 생성하기

In [20]:
from string import punctuation

import numpy as np
import pandas as pd

# pad_sequences must be the Keras function; the stray
# `import pandas as pad_sequences` alias that used to precede it was dropped
# (pandas remains available as `pd`).
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

In [21]:
import os

# Location of the articles CSV. The default is the original machine-specific
# path; set the ARTICLES_CSV environment variable to run the notebook elsewhere.
ARTICLES_CSV = os.environ.get(
    'ARTICLES_CSV',
    r'C:/Users/GJAISCHOOL/Desktop/X_filter/Algorithm/dataset/ArticlesJan2018.csv',
)
df = pd.read_csv(ARTICLES_CSV)
df.head()

Unnamed: 0,abstract,articleID,articleWordCount,byline,documentType,headline,keywords,multimedia,newDesk,printPage,pubDate,sectionName,snippet,source,typeOfMaterial,webURL
0,,5a7101c110f40f00018be961,1322,By SHANNON SIMS,article,"Rhythm of the Streets: ‘We’re Warrior Women, a...","['Bahia (Brazil)', 'Music', 'Women and Girls',...",68,Travel,5,2018-01-30 23:37:31,Unknown,Meet the all-female Brazilian drum group that ...,The New York Times,News,https://www.nytimes.com/2018/01/30/travel/braz...
1,,5a70fc1210f40f00018be950,1308,By ALAN RAPPEPORT and THOMAS KAPLAN,article,"As Deficit Grows, Congress Keeps Spending","['United States Politics and Government', 'Fed...",68,Washington,17,2018-01-30 23:13:14,Politics,Treasury Secretary Steven Mnuchin urged Congre...,The New York Times,News,https://www.nytimes.com/2018/01/30/us/politics...
2,,5a70f8f810f40f00018be943,228,By JANE LANG,article,Lesson in Select Bus Service,"['Buses', 'Pennsylvania Station (Manhattan, NY...",0,Metro,16,2018-01-30 23:00:01,Unknown,A woman finds out what happens when you don’t ...,The New York Times,News,https://www.nytimes.com/2018/01/30/nyregion/me...
3,,5a70eb8110f40f00018be925,1114,By THE EDITORIAL BOARD,article,Here’s the Real State of the Union,"['State of the Union Message (US)', 'Trump, Do...",61,Editorial,24,2018-01-30 22:02:36,Editorials,The reaction against his authoritarian impulse...,The New York Times,Editorial,https://www.nytimes.com/2018/01/30/opinion/edi...
4,,5a70d1d210f40f00018be8d9,777,By BERT STRATTON,article,Good Riddance to Chief Wahoo,"['Baseball', 'Cleveland Indians', 'Western Res...",68,OpEd,0,2018-01-30 20:13:01,Unknown,"I’ve lived in Cleveland all my life, and I’m g...",The New York Times,Op-Ed,https://www.nytimes.com/2018/01/30/opinion/chi...


In [22]:
# Report the number of columns and list their names.
print('열의 개수 : ', len(df.columns))
print(df.columns)

열의 개수 :  16
Index(['abstract', 'articleID', 'articleWordCount', 'byline', 'documentType',
       'headline', 'keywords', 'multimedia', 'newDesk', 'printPage', 'pubDate',
       'sectionName', 'snippet', 'source', 'typeOfMaterial', 'webURL'],
      dtype='object')


In [23]:
# True if any value in the headline column is null.
print(df['headline'].isnull().values.any())

False


In [24]:
# Collect every value of the headline column into a plain Python list.
headline = list(df.headline.values)
headline[:5]

['Rhythm of the Streets: ‘We’re Warrior Women, and Yes, We Can Play’',
 'As Deficit Grows, Congress Keeps Spending',
 'Lesson in Select Bus Service',
 'Here’s the Real State of the Union',
 'Good Riddance to Chief Wahoo']

In [25]:
# Number of headlines collected.
print('총 샘플의 개수 : {}'.format(len(headline)))

총 샘플의 개수 : 905


In [26]:
# Drop placeholder headlines. The placeholder is spelled "Unknown" (the same
# marker the dataset shows in the sectionName column); the earlier spelling
# "Unkown" never matched anything, so the filter was a no-op.
headline = [title for title in headline if title != "Unknown"]
print('노이즈값 제거 후 샘플의 개수 : {}'.format(len(headline)))

노이즈값 제거 후 샘플의 개수 : 905


In [27]:
# Preview the first five headlines after filtering.
headline[:5]

['Rhythm of the Streets: ‘We’re Warrior Women, and Yes, We Can Play’',
 'As Deficit Grows, Congress Keeps Spending',
 'Lesson in Select Bus Service',
 'Here’s the Real State of the Union',
 'Good Riddance to Chief Wahoo']

In [28]:
def repreprocessing(raw_sentence):
    """Normalize a headline: drop non-ASCII characters and punctuation, then lowercase.

    Args:
        raw_sentence: headline text as a ``str``.

    Returns:
        The text with every non-ASCII character and every character from
        ``string.punctuation`` removed, converted to lower case.
    """
    # Round-trip through bytes; characters that are not ASCII are discarded.
    ascii_only = raw_sentence.encode("utf8").decode("ascii", 'ignore')
    # Remove punctuation in one pass, then lowercase.
    without_punctuation = ascii_only.translate(str.maketrans('', '', punctuation))
    return without_punctuation.lower()

# Apply the normalization to every headline and preview the first five.
preprocessed_headline = list(map(repreprocessing, headline))
preprocessed_headline[:5]

['rhythm of the streets were warrior women and yes we can play',
 'as deficit grows congress keeps spending',
 'lesson in select bus service',
 'heres the real state of the union',
 'good riddance to chief wahoo']

In [29]:
# Fit a fresh tokenizer on the preprocessed headlines (rebinds `tokenizer`
# and `vocab_size` from the first half of the notebook).
tokenizer = Tokenizer()
tokenizer.fit_on_texts(preprocessed_headline)
# +1 for the extra index 0 alongside the word indices.
vocab_size = len(tokenizer.word_index) + 1
print('단어 집합의 크기 : %d' % vocab_size)

단어 집합의 크기 : 2513


In [30]:
# Build next-word training samples: for every headline, emit each prefix of
# its integer encoding that contains at least two tokens.
sequences = []

for sentence in preprocessed_headline:
    # Integer-encode the headline.
    token_ids = tokenizer.texts_to_sequences([sentence])[0]
    sequences.extend(token_ids[:end] for end in range(2, len(token_ids) + 1))

sequences[:11]

[[754, 4],
 [754, 4, 1],
 [754, 4, 1, 229],
 [754, 4, 1, 229, 162],
 [754, 4, 1, 229, 162, 755],
 [754, 4, 1, 229, 162, 755, 85],
 [754, 4, 1, 229, 162, 755, 85, 7],
 [754, 4, 1, 229, 162, 755, 85, 7, 163],
 [754, 4, 1, 229, 162, 755, 85, 7, 163, 39],
 [754, 4, 1, 229, 162, 755, 85, 7, 163, 39, 49],
 [754, 4, 1, 229, 162, 755, 85, 7, 163, 39, 49, 86]]

In [31]:
# Invert word_index so that integer indices can be mapped back to words.
index_to_word = {index: word for word, index in tokenizer.word_index.items()}

print('빈도수 상위 582번 단어 : {}'.format(index_to_word[582]))

빈도수 상위 582번 단어 : drug


In [32]:
# Length of the longest sample; used as the padding target below.
max_len = max(map(len, sequences))
print('샘플의 최대 길이 : {}'.format(max_len))

샘플의 최대 길이 : 17


In [33]:
# Left-pad every sample with zeros up to max_len, then preview three rows.
sequences = pad_sequences(sequences, maxlen=max_len, padding='pre')
print(sequences[:3])

[[  0   0   0   0   0   0   0   0   0   0   0   0   0   0   0 754   4]
 [  0   0   0   0   0   0   0   0   0   0   0   0   0   0 754   4   1]
 [  0   0   0   0   0   0   0   0   0   0   0   0   0 754   4   1 229]]


In [34]:
# Split each padded row: everything but the last token is the model input,
# the last token is the label to predict.
sequences = np.array(sequences)
X, y = sequences[:, :-1], sequences[:, -1]

In [35]:
# Preview the first three input rows.
print(X[: 3])

[[  0   0   0   0   0   0   0   0   0   0   0   0   0   0   0 754]
 [  0   0   0   0   0   0   0   0   0   0   0   0   0   0 754   4]
 [  0   0   0   0   0   0   0   0   0   0   0   0   0 754   4   1]]


In [36]:
# Preview the first three integer labels.
print(y[:3])

[  4   1 229]


In [37]:
# One-hot encode the labels over the full vocabulary.
# Note: this rebinds `y`, so re-running the cell would encode it a second time.
y = to_categorical(y, num_classes=vocab_size)

In [38]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, Dense, LSTM


In [39]:
# Hyperparameters for the headline next-word model.
embedding_dim = 10
hidden_units = 128

# Embedding -> LSTM -> softmax over the vocabulary.
model = Sequential([
    Embedding(vocab_size, embedding_dim),
    LSTM(hidden_units),
    Dense(vocab_size, activation='softmax'),
])
model.compile(
    loss='categorical_crossentropy',
    optimizer='adam',
    metrics=['accuracy'],
)
model.fit(X, y, epochs=200, verbose=2)


Epoch 1/200
156/156 - 4s - loss: 7.4739 - accuracy: 0.0306 - 4s/epoch - 28ms/step
Epoch 2/200
156/156 - 1s - loss: 6.9622 - accuracy: 0.0347 - 996ms/epoch - 6ms/step
Epoch 3/200
156/156 - 1s - loss: 6.8366 - accuracy: 0.0298 - 1s/epoch - 6ms/step
Epoch 4/200
156/156 - 1s - loss: 6.7416 - accuracy: 0.0314 - 1s/epoch - 6ms/step
Epoch 5/200
156/156 - 1s - loss: 6.6402 - accuracy: 0.0472 - 1s/epoch - 8ms/step
Epoch 6/200
156/156 - 1s - loss: 6.5269 - accuracy: 0.0544 - 998ms/epoch - 6ms/step
Epoch 7/200
156/156 - 1s - loss: 6.3854 - accuracy: 0.0560 - 948ms/epoch - 6ms/step
Epoch 8/200
156/156 - 1s - loss: 6.2280 - accuracy: 0.0609 - 1s/epoch - 7ms/step
Epoch 9/200
156/156 - 1s - loss: 6.0564 - accuracy: 0.0708 - 967ms/epoch - 6ms/step
Epoch 10/200
156/156 - 1s - loss: 5.8942 - accuracy: 0.0671 - 1s/epoch - 7ms/step
Epoch 11/200
156/156 - 1s - loss: 5.7441 - accuracy: 0.0708 - 1s/epoch - 7ms/step
Epoch 12/200
156/156 - 1s - loss: 5.6041 - accuracy: 0.0736 - 1s/epoch - 6ms/step
Epoch 13/200

<keras.callbacks.History at 0x2bcfec36f10>

In [40]:
def sentence_generation(model, tokenizer, current_word, n, seq_len=None):
    """Generate text by repeatedly predicting the next word.

    Args:
        model: trained Keras model whose ``predict`` returns one score per
            vocabulary index for a padded integer sequence.
        tokenizer: fitted Keras ``Tokenizer`` used to encode the text and to
            map predicted indices back to words.
        current_word: seed text the generation starts from.
        n: maximum number of words to predict.
        seq_len: length the encoded input is left-padded / truncated to.
            When omitted, the notebook-level ``max_len - 1`` is used (the
            width of the training input ``X``).

    Returns:
        The seed text followed by the predicted words, separated by spaces.
        Generation stops early if the model predicts an index that has no
        word (e.g. the padding index 0).
    """
    if seq_len is None:
        # Read the global at call time, as the cell did before.
        seq_len = max_len - 1

    init_word = current_word
    generated = []

    for _ in range(n):
        # Integer-encode the text produced so far and pad it to the model input length.
        encoded = tokenizer.texts_to_sequences([current_word])[0]
        encoded = pad_sequences([encoded], maxlen=seq_len, padding='pre')

        # Predict the next word: take the most probable index as a plain int
        # (the batch holds a single sample).
        scores = model.predict(encoded, verbose=0)
        predicted_index = int(np.argmax(scores, axis=1)[0])

        # Direct index -> word lookup instead of scanning word_index. A miss
        # means the prediction is not a real word, so stop rather than append
        # an unrelated vocabulary entry.
        word = tokenizer.index_word.get(predicted_index)
        if word is None:
            break

        # The predicted word becomes part of the context for the next step.
        current_word = current_word + ' ' + word
        generated.append(word)

    return ' '.join([init_word] + generated)


In [41]:
# Generate 10 words following the seed 'i'.
print(sentence_generation(model, tokenizer, 'i', 10))


i was there for the intervention in the coconut its 1


In [45]:
# Generate 10 words following the seed 'how'.
print(sentence_generation(model, tokenizer, 'how', 10))

how to speak grief is skeptical on trumps foundation and be
