# LSTM을 이용하여 텍스트 생성하기

#### 1) 데이터에 대한 이해와 전처리

In [1]:
import numpy as np
import pandas as pd
from string import punctuation

from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.utils import to_categorical

In [2]:
# Load the NYT article metadata from a CSV located relative to the working directory.
df = pd.read_csv('ArticlesApril2018.csv')
# Preview the first rows to see which columns are available.
df.head()

Unnamed: 0,articleID,articleWordCount,byline,documentType,headline,keywords,multimedia,newDesk,printPage,pubDate,sectionName,snippet,source,typeOfMaterial,webURL
0,5adf6684068401528a2aa69b,781,By JOHN BRANCH,article,Former N.F.L. Cheerleaders’ Settlement Offer: ...,"['Workplace Hazards and Violations', 'Football...",68,Sports,0,2018-04-24 17:16:49,Pro Football,"“I understand that they could meet with us, pa...",The New York Times,News,https://www.nytimes.com/2018/04/24/sports/foot...
1,5adf653f068401528a2aa697,656,By LISA FRIEDMAN,article,E.P.A. to Unveil a New Rule. Its Effect: Less ...,"['Environmental Protection Agency', 'Pruitt, S...",68,Climate,0,2018-04-24 17:11:21,Unknown,The agency plans to publish a new regulation T...,The New York Times,News,https://www.nytimes.com/2018/04/24/climate/epa...
2,5adf4626068401528a2aa628,2427,By PETE WELLS,article,"The New Noma, Explained","['Restaurants', 'Noma (Copenhagen, Restaurant)...",66,Dining,0,2018-04-24 14:58:44,Unknown,What’s it like to eat at the second incarnatio...,The New York Times,News,https://www.nytimes.com/2018/04/24/dining/noma...
3,5adf40d2068401528a2aa619,626,By JULIE HIRSCHFELD DAVIS and PETER BAKER,article,Unknown,"['Macron, Emmanuel (1977- )', 'Trump, Donald J...",68,Washington,0,2018-04-24 14:35:57,Europe,President Trump welcomed President Emmanuel Ma...,The New York Times,News,https://www.nytimes.com/2018/04/24/world/europ...
4,5adf3d64068401528a2aa60f,815,By IAN AUSTEN and DAN BILEFSKY,article,Unknown,"['Toronto, Ontario, Attack (April, 2018)', 'Mu...",68,Foreign,0,2018-04-24 14:21:21,Canada,"Alek Minassian, 25, a resident of Toronto’s Ri...",The New York Times,News,https://www.nytimes.com/2018/04/24/world/canad...


In [3]:
# Print column dtypes and non-null counts to check for missing values.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1324 entries, 0 to 1323
Data columns (total 15 columns):
 #   Column            Non-Null Count  Dtype 
---  ------            --------------  ----- 
 0   articleID         1324 non-null   object
 1   articleWordCount  1324 non-null   int64 
 2   byline            1324 non-null   object
 3   documentType      1324 non-null   object
 4   headline          1324 non-null   object
 5   keywords          1324 non-null   object
 6   multimedia        1324 non-null   int64 
 7   newDesk           1324 non-null   object
 8   printPage         1324 non-null   int64 
 9   pubDate           1324 non-null   object
 10  sectionName       1324 non-null   object
 11  snippet           1324 non-null   object
 12  source            1324 non-null   object
 13  typeOfMaterial    1324 non-null   object
 14  webURL            1324 non-null   object
dtypes: int64(3), object(12)
memory usage: 155.3+ KB


In [4]:
# Pull every value of the 'headline' column into a plain Python list.
headline = list(df['headline'].values)
headline[:5]

['Former N.F.L. Cheerleaders’ Settlement Offer: $1 and a Meeting With Goodell',
 'E.P.A. to Unveil a New Rule. Its Effect: Less Science in Policymaking.',
 'The New Noma, Explained',
 'Unknown',
 'Unknown']

In [5]:
# Drop placeholder entries: headlines recorded as the literal 'Unknown' carry no text.
headline = [title for title in headline if title != 'Unknown']
print('노이즈 제거 후 개수 :', len(headline))

노이즈 제거 후 개수 : 1214


In [6]:
# Strip punctuation and lowercase
def repreprocessing(sentence):
    """Normalize a headline: drop non-ASCII characters and punctuation, then lowercase.

    Args:
        sentence: Raw headline text.

    Returns:
        The cleaned, lowercased string. Whitespace is left untouched, so removing a
        character between two spaces leaves a double space.
    """
    # Round-tripping through ASCII with 'ignore' silently discards characters such as
    # curly quotes that are not part of string.punctuation.
    ascii_only = sentence.encode('utf8').decode('ascii', 'ignore')
    # A translation table with only a "delete" set removes every punctuation character.
    strip_punctuation = str.maketrans('', '', punctuation)
    return ascii_only.translate(strip_punctuation).lower()


In [7]:
# Clean every remaining headline with the normalizer defined above.
preprocessed_headline = list(map(repreprocessing, headline))
preprocessed_headline[:5]

['former nfl cheerleaders settlement offer 1 and a meeting with goodell',
 'epa to unveil a new rule its effect less science in policymaking',
 'the new noma explained',
 'how a bag of texas dirt  became a times tradition',
 'is school a place for selfexpression']

In [8]:
# Build the word -> integer index vocabulary from the cleaned headlines.
tokenizer = Tokenizer()
tokenizer.fit_on_texts(preprocessed_headline)
# +1 because Keras Tokenizer indices start at 1; index 0 is left free for padding.
vocab_size = len(tokenizer.word_index)+1
print("단어 집합 크기 : ", vocab_size)

단어 집합 크기 :  3494


In [9]:
# Integer-encode each headline, then emit every prefix of length >= 2 as one
# training sample: the last token of a prefix is the word to predict from the rest.
sequences = [
    encoded[:end]
    for encoded in tokenizer.texts_to_sequences(preprocessed_headline)
    for end in range(2, len(encoded) + 1)
]

In [10]:
# Show the first 11 samples: the growing prefixes of the first headline, then the start of the second.
sequences[:11]

[[99, 269],
 [99, 269, 371],
 [99, 269, 371, 1115],
 [99, 269, 371, 1115, 582],
 [99, 269, 371, 1115, 582, 52],
 [99, 269, 371, 1115, 582, 52, 7],
 [99, 269, 371, 1115, 582, 52, 7, 2],
 [99, 269, 371, 1115, 582, 52, 7, 2, 372],
 [99, 269, 371, 1115, 582, 52, 7, 2, 372, 10],
 [99, 269, 371, 1115, 582, 52, 7, 2, 372, 10, 1116],
 [100, 3]]

하나의 문장을 단어 하나씩 늘려 가며 여러 샘플로 만드는 이유
- 하나의 단어를 예측할 때 그 앞에 등장한 단어들을 모두 참고하도록 하기 위함
- 즉 각 시점(time step)마다 단어가 하나씩 추가된 형태의 샘플을 구성
- 이후 각 샘플의 마지막 단어를 예측 대상인 레이블로 분리하는 작업을 수행
    - 정수 인덱스가 어떤 단어인지 확인할 때는 index_to_word 활용

In [11]:
# Reverse lookup (integer index -> word), built by inverting the tokenizer's vocabulary.
index_to_word = {index: word for word, index in tokenizer.word_index.items()}

In [12]:
# Sanity-check the reverse lookup with index 582, which appears in the first headline's samples.
print("빈도수 상위 582번 단어 : {}".format(index_to_word[582]))

빈도수 상위 582번 단어 : offer


In [13]:
# The longest sample determines the width every sample is padded to.
max_len = max(map(len, sequences))
print("샘플 최대 길이 :", max_len)

샘플 최대 길이 : 24


In [14]:
# Pad on the left ('pre') up to max_len so that the last column of every row
# is always the word to predict.
sequences = pad_sequences(sequences, maxlen=max_len, padding='pre')

In [15]:
# Split each padded row into model input (all but the last column) and label (last column).
sequences = np.array(sequences)
X, y = sequences[:, :-1], sequences[:, -1]

In [16]:
# One-hot encode the labels for categorical cross-entropy.
# The labels are read from the last column of `sequences` rather than from `y` itself,
# so the cell is idempotent: re-running it no longer one-hot encodes an already
# one-hot matrix (which would produce an (N, vocab_size, vocab_size) tensor).
y = to_categorical(sequences[:, -1], num_classes=vocab_size)

#### 2) 모델 설계하기

In [17]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, Dense, LSTM

# import tensorflow as tf
# config = tf.compat.v1.ConfigProto() 
# config.gpu_options.allow_growth = True


# Next-word classifier: 10-dimensional word embeddings -> LSTM with 128 units ->
# softmax over the whole vocabulary.
model = Sequential([
    Embedding(vocab_size, 10),
    LSTM(128),
    Dense(vocab_size, activation='softmax'),
])
# Labels are one-hot vectors, hence categorical cross-entropy.
model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])
# verbose=2 prints one summary line per epoch.
model.fit(X, y, verbose=2, epochs=200)

Epoch 1/200
244/244 - 4s - loss: 7.6440 - accuracy: 0.0247 - 4s/epoch - 17ms/step
Epoch 2/200
244/244 - 2s - loss: 7.1193 - accuracy: 0.0286 - 2s/epoch - 6ms/step
Epoch 3/200
244/244 - 2s - loss: 6.9830 - accuracy: 0.0350 - 2s/epoch - 6ms/step
Epoch 4/200
244/244 - 2s - loss: 6.8540 - accuracy: 0.0408 - 2s/epoch - 7ms/step
Epoch 5/200
244/244 - 2s - loss: 6.6995 - accuracy: 0.0469 - 2s/epoch - 7ms/step
Epoch 6/200
244/244 - 2s - loss: 6.5329 - accuracy: 0.0502 - 2s/epoch - 6ms/step
Epoch 7/200
244/244 - 2s - loss: 6.3542 - accuracy: 0.0500 - 2s/epoch - 7ms/step
Epoch 8/200
244/244 - 2s - loss: 6.1628 - accuracy: 0.0566 - 2s/epoch - 6ms/step
Epoch 9/200
244/244 - 2s - loss: 5.9640 - accuracy: 0.0627 - 2s/epoch - 6ms/step
Epoch 10/200
244/244 - 2s - loss: 5.7752 - accuracy: 0.0689 - 2s/epoch - 7ms/step
Epoch 11/200
244/244 - 2s - loss: 5.5961 - accuracy: 0.0729 - 2s/epoch - 6ms/step
Epoch 12/200
244/244 - 2s - loss: 5.4264 - accuracy: 0.0755 - 2s/epoch - 6ms/step
Epoch 13/200
244/244 - 1

<keras.callbacks.History at 0x1be22b1b580>

In [20]:
def sentence_generation(model, tokenizer, current_word, n):
    """Greedily extend a seed text with up to ``n`` predicted words.

    At each step the text generated so far is integer-encoded, left-padded to the
    model's input width (``max_len - 1``; ``max_len`` is read from module scope),
    and the highest-probability word is appended.

    Args:
        model: Trained model whose ``predict`` returns one probability row per input.
        tokenizer: Fitted tokenizer providing ``texts_to_sequences`` and ``word_index``.
        current_word: Seed text (one or more words) to start from.
        n: Maximum number of words to generate.

    Returns:
        The seed text followed by the generated words, separated by single spaces.
        Generation stops early if the model predicts an index that has no word in
        the vocabulary (e.g. the padding index 0).
    """
    # Invert the vocabulary once instead of scanning word_index for every generated word.
    index_to_word = {index: word for word, index in tokenizer.word_index.items()}

    init_word = current_word
    generated = []

    for _ in range(n):
        encoded = tokenizer.texts_to_sequences([current_word])[0]
        encoded = pad_sequences([encoded], maxlen=max_len-1, padding='pre')

        probabilities = model.predict(encoded, verbose=0)
        # Take a scalar index; comparing an int against a 1-element array is fragile.
        predicted_index = int(np.argmax(probabilities, axis=1)[0])

        word = index_to_word.get(predicted_index)
        if word is None:
            # No vocabulary word for this index. The input would not change, so the
            # prediction would repeat; stop instead of appending an unrelated word.
            break

        current_word = current_word + ' ' + word
        generated.append(word)

    return ' '.join([init_word] + generated)

In [21]:
# Generate 10 words starting from the seed word 'i'.
print(sentence_generation(model, tokenizer, 'i', 10))


i disapprove of school vouchers can i still apply for them


In [22]:
# Generate 10 words from a capitalized seed; the seed is echoed unchanged in the output.
print(sentence_generation(model, tokenizer, 'Game', 10))


Game your baby is your roommate workers the political firm nightmare


In [25]:
# Request 50 words, more than the longest training sample (max_len), to see how
# the output degrades on long generations.
print(sentence_generation(model, tokenizer, 'how', 50))


how to make facebook more accountable rules the wrestling mommy and and more abuse party girls was it whats girls of was was admissions reimaginedjurassic classics reimaginedjurassic classics reimaginedjurassic classics reimaginedjurassic reimaginedjurassic fight reimaginedjurassic classics reimaginedjurassic reimaginedjurassic reimaginedjurassic given my do the absurdist office fire for a cathedral todo list todo
