In [1]:
import pandas as pd
from string import punctuation

In [2]:
punctuation # 특수문자 제거용

'!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~'

In [3]:
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
import numpy as np
from tensorflow.keras.utils import to_categorical

In [4]:
df=pd.read_csv('ArticlesApril2018.csv')
df.head(3)

Unnamed: 0,articleID,articleWordCount,byline,documentType,headline,keywords,multimedia,newDesk,printPage,pubDate,sectionName,snippet,source,typeOfMaterial,webURL
0,5adf6684068401528a2aa69b,781,By JOHN BRANCH,article,Former N.F.L. Cheerleaders’ Settlement Offer: ...,"['Workplace Hazards and Violations', 'Football...",68,Sports,0,2018-04-24 17:16:49,Pro Football,"“I understand that they could meet with us, pa...",The New York Times,News,https://www.nytimes.com/2018/04/24/sports/foot...
1,5adf653f068401528a2aa697,656,By LISA FRIEDMAN,article,E.P.A. to Unveil a New Rule. Its Effect: Less ...,"['Environmental Protection Agency', 'Pruitt, S...",68,Climate,0,2018-04-24 17:11:21,Unknown,The agency plans to publish a new regulation T...,The New York Times,News,https://www.nytimes.com/2018/04/24/climate/epa...
2,5adf4626068401528a2aa628,2427,By PETE WELLS,article,"The New Noma, Explained","['Restaurants', 'Noma (Copenhagen, Restaurant)...",66,Dining,0,2018-04-24 14:58:44,Unknown,What’s it like to eat at the second incarnatio...,The New York Times,News,https://www.nytimes.com/2018/04/24/dining/noma...


In [5]:
# 한눈에 보기 어려우므로
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1324 entries, 0 to 1323
Data columns (total 15 columns):
 #   Column            Non-Null Count  Dtype 
---  ------            --------------  ----- 
 0   articleID         1324 non-null   object
 1   articleWordCount  1324 non-null   int64 
 2   byline            1324 non-null   object
 3   documentType      1324 non-null   object
 4   headline          1324 non-null   object
 5   keywords          1324 non-null   object
 6   multimedia        1324 non-null   int64 
 7   newDesk           1324 non-null   object
 8   printPage         1324 non-null   int64 
 9   pubDate           1324 non-null   object
 10  sectionName       1324 non-null   object
 11  snippet           1324 non-null   object
 12  source            1324 non-null   object
 13  typeOfMaterial    1324 non-null   object
 14  webURL            1324 non-null   object
dtypes: int64(3), object(12)
memory usage: 155.3+ KB


In [6]:
# 사용할 열인 제목에 해당되는 headline 열에 Null값이 있는지 확인
df['headline'].isnull().values.any()

False

In [7]:
headline= [] # 리스트 선언
# append를 하면 리스트 안의 리스트 형태가 되지만,
headline.append(list(df.headline.values))
# headline[:5]

In [8]:
# extend를 하면 리스로 깔끔하게 정리됨
headline = []
headline.extend(list(df.headline.values))
headline[:5]

['Former N.F.L. Cheerleaders’ Settlement Offer: $1 and a Meeting With Goodell',
 'E.P.A. to Unveil a New Rule. Its Effect: Less Science in Policymaking.',
 'The New Noma, Explained',
 'Unknown',
 'Unknown']

In [9]:
print('총 샘플의 개수 : {}'.format(len(headline))) # 현재 샘플의 개수

총 샘플의 개수 : 1324


In [10]:
headline = [n for n in headline if n != "Unknown"] # Unknown 값을 가진 샘플 제거
print('노이즈값 제거 후 샘플의 개수 : {}'.format(len(headline))) # 제거 후 샘플의 개수

노이즈값 제거 후 샘플의 개수 : 1214


In [11]:
# 'Unknown' 제거 확인
headline[:5]

['Former N.F.L. Cheerleaders’ Settlement Offer: $1 and a Meeting With Goodell',
 'E.P.A. to Unveil a New Rule. Its Effect: Less Science in Policymaking.',
 'The New Noma, Explained',
 'How a Bag of Texas Dirt  Became a Times Tradition',
 'Is School a Place for Self-Expression?']

In [12]:
# 전처리. 구두점 제거와 단어의 소문자화
def repreprocessing(s):
    s = s.encode("utf8").decode("ascii", 'ignore')
    return ''.join(c for c in s if c not in punctuation).lower()
    # 구두점 제거와 소문자화

In [13]:
text = [repreprocessing(x) for x in headline]
text[:5]

['former nfl cheerleaders settlement offer 1 and a meeting with goodell',
 'epa to unveil a new rule its effect less science in policymaking',
 'the new noma explained',
 'how a bag of texas dirt  became a times tradition',
 'is school a place for selfexpression']

### 불용어 처리를 하면 안 되는 이유
### RNN은 인접한 단어간의 관계가 중요하기 때문!

In [14]:
t = Tokenizer()
t.fit_on_texts(text)
vocab_size = len(t.word_index) + 1
print('단어 집합의 크기 : %d' % vocab_size)

단어 집합의 크기 : 3494


In [15]:
# 정수 인코딩. 숫자화. 하나의 문장을 여러 줄로 분해하여 훈련 데이터 구성
sequences = list()

for line in text:
    encoded = t.texts_to_sequences([line])[0]
    for i in range(1, len(encoded)):
        sequence = encoded[:i+1]
        sequences.append(sequence)

In [16]:
sequences[:11] # 11개의 샘플 출력

[[99, 269],
 [99, 269, 371],
 [99, 269, 371, 1115],
 [99, 269, 371, 1115, 582],
 [99, 269, 371, 1115, 582, 52],
 [99, 269, 371, 1115, 582, 52, 7],
 [99, 269, 371, 1115, 582, 52, 7, 2],
 [99, 269, 371, 1115, 582, 52, 7, 2, 372],
 [99, 269, 371, 1115, 582, 52, 7, 2, 372, 10],
 [99, 269, 371, 1115, 582, 52, 7, 2, 372, 10, 1116],
 [100, 3]]

In [17]:
len(sequences)

7803

In [18]:
# 사전 만들기
index_to_word = {}
for key, value in t.word_index.items(): # 인덱스와 단어 바꾸기
    index_to_word[value] = key

In [19]:
print('빈도수 상위 582번 단어 : {}'.format(index_to_word[582]))

빈도수 상위 582번 단어 : offer


In [20]:
print('빈도수 상위 3번 단어 : {}'.format(index_to_word[3]))

빈도수 상위 3번 단어 : to


In [21]:
max_len = max(len(l) for l in sequences)
# 샘플의 최대 길이
max_len

24

In [22]:
# 최대 길이 24로 모든 샘플 패딩
sequences = pad_sequences(sequences, maxlen=max_len, padding='pre')

In [23]:
sequences[:3]

array([[   0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          99,  269],
       [   0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,   99,
         269,  371],
       [   0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,   99,  269,
         371, 1115]], dtype=int32)

In [24]:
# feature와 label분리를 위해 numpy 이용
sequences = np.array(sequences)
X = sequences[:, :-1]
y = sequences[:, -1]

In [25]:
X[:3]

array([[  0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
          0,   0,   0,   0,   0,   0,   0,   0,   0,  99],
       [  0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
          0,   0,   0,   0,   0,   0,   0,   0,  99, 269],
       [  0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
          0,   0,   0,   0,   0,   0,   0,  99, 269, 371]], dtype=int32)

In [26]:
y[:3]

array([ 269,  371, 1115], dtype=int32)

In [27]:
# 레이블 원-핫 인코딩
y = to_categorical(y, num_classes=vocab_size)

In [28]:
y[:3]

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]], dtype=float32)

In [29]:
vocab_size

3494

In [30]:
max_len

24

### 모델 설계하기

In [31]:
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, Dense, LSTM

In [32]:
model = Sequential()
model.add(Embedding(vocab_size, 10, input_length=max_len-1))
# y데이터를 분리하였으므로 이제 X데이터의 길이는 기존 데이터의 길이 - 1
model.add(LSTM(128))
model.add(Dense(vocab_size, activation='softmax'))
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
model.fit(X, y, epochs=200, verbose=2)

Epoch 1/200
244/244 - 5s - loss: 7.6415 - accuracy: 0.0273
Epoch 2/200
244/244 - 5s - loss: 7.1033 - accuracy: 0.0279
Epoch 3/200
244/244 - 5s - loss: 6.9708 - accuracy: 0.0340
Epoch 4/200
244/244 - 5s - loss: 6.8394 - accuracy: 0.0422
Epoch 5/200
244/244 - 5s - loss: 6.6825 - accuracy: 0.0464
Epoch 6/200
244/244 - 5s - loss: 6.4941 - accuracy: 0.0523
Epoch 7/200
244/244 - 5s - loss: 6.2905 - accuracy: 0.0568
Epoch 8/200
244/244 - 5s - loss: 6.0840 - accuracy: 0.0609
Epoch 9/200
244/244 - 5s - loss: 5.8818 - accuracy: 0.0639
Epoch 10/200
244/244 - 5s - loss: 5.6941 - accuracy: 0.0691
Epoch 11/200
244/244 - 5s - loss: 5.5129 - accuracy: 0.0732
Epoch 12/200
244/244 - 5s - loss: 5.3443 - accuracy: 0.0775
Epoch 13/200
244/244 - 5s - loss: 5.1851 - accuracy: 0.0848
Epoch 14/200
244/244 - 5s - loss: 5.0306 - accuracy: 0.0947
Epoch 15/200
244/244 - 5s - loss: 4.8851 - accuracy: 0.1029
Epoch 16/200
244/244 - 5s - loss: 4.7477 - accuracy: 0.1151
Epoch 17/200
244/244 - 5s - loss: 4.6147 - accura

<tensorflow.python.keras.callbacks.History at 0x7f15bc352c10>

In [33]:
def sentence_generation(model, t, current_word, n): # 모델, 토크나이저, 현재 단어, 반복할 횟수
    init_word = current_word # 처음 들어온 단어도 마지막에 같이 출력하기위해 저장
    sentence = ''
    for _ in range(n): # n번 반복
        encoded = t.texts_to_sequences([current_word])[0] # 현재 단어에 대한 정수 인코딩
        encoded = pad_sequences([encoded], maxlen=23, padding='pre') # 데이터에 대한 패딩
        result = model.predict_classes(encoded, verbose=0)
    # 입력한 X(현재 단어)에 대해서 y를 예측하고 y(예측한 단어)를 result에 저장.
        for word, index in t.word_index.items(): 
            if index == result: # 만약 예측한 단어와 인덱스와 동일한 단어가 있다면
                break # 해당 단어가 예측 단어이므로 break
        current_word = current_word + ' '  + word # 현재 단어 + ' ' + 예측 단어를 현재 단어로 변경
        sentence = sentence + ' ' + word # 예측 단어를 문장에 저장
    # for문이므로 이 행동을 다시 반복
    sentence = init_word + sentence
    return sentence

In [34]:
print(sentence_generation(model, t, 'i', 10))
# 임의의 단어 'i'에 대해서 10개의 단어를 추가 생성

Instructions for updating:
Please use instead:* `np.argmax(model.predict(x), axis=-1)`,   if your model does multi-class classification   (e.g. if it uses a `softmax` last-layer activation).* `(model.predict(x) > 0.5).astype("int32")`,   if your model does binary classification   (e.g. if it uses a `sigmoid` last-layer activation).
i cant jump ship from facebook yet abuse better for smile


In [35]:
print(sentence_generation(model, t, 'how', 10))
# 임의의 단어 'how'에 대해서 10개의 단어를 추가 생성

how to make facebook more accountable i later on not ok
