In [34]:
import pandas as pd
from string import punctuation
import statsmodels.api as sm
from tensorflow import keras
from keras.models import Sequential
from keras.layers import Embedding, Dense, LSTM
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.utils import to_categorical

In [68]:
# Load the article metadata CSV and preview its first rows.
df = pd.read_csv('ArticlesMay2017.csv', encoding='utf8')
df.head()

Unnamed: 0,abstract,articleID,articleWordCount,byline,documentType,headline,keywords,multimedia,newDesk,printPage,pubDate,sectionName,snippet,source,typeOfMaterial,webURL
0,,5906b3197c459f24986dd003,2412,By IAN JOHNSON,article,My Beijing: The Sacred City,"['Travel and Vacations', 'Beijing (China)', 'R...",3,Travel,1,2017-05-01 04:01:21,Unknown,"This metropolis was once a total work of art, ...",The New York Times,News,https://www.nytimes.com/2017/05/01/travel/beij...
1,,5906b3297c459f24986dd004,2318,By EMMA G. FITZSIMMONS,article,"6 Million Riders a Day, 1930s Technology","['Subways', 'Delays (Transportation)', 'Transi...",68,Metro,1,2017-05-01 04:01:33,Unknown,New York’s subway is struggling with old infra...,The New York Times,News,https://www.nytimes.com/2017/05/01/nyregion/ne...
2,,5906ceec7c459f24986dd021,1795,By MARC TRACY,article,Seeking a Cross-Border Conference,"['Cetys University', 'College Athletics', 'Nat...",3,Sports,1,2017-05-01 06:00:05,Unknown,Cetys University is making an ambitious bid to...,The New York Times,News,https://www.nytimes.com/2017/05/01/sports/mexi...
3,,5906cfa77c459f24986dd022,213,By SHANNON DOYNE,article,"Questions for: ‘Despite the “Yuck Factor,” Lee...",[],3,Learning,0,2017-05-01 06:03:03,Unknown,How are leeches used to treat various medical ...,The New York Times,News,https://www.nytimes.com/2017/05/01/learning/qu...
4,,5906e1c07c459f24986dd039,1342,By JASON STANLEY,article,Who Is a ‘Criminal’?,"['Illegal Immigration', 'Traffic and Parking V...",3,OpEd,0,2017-05-01 07:20:26,Unknown,Justice Roberts was right. The Trump administr...,The New York Times,Op-Ed,https://www.nytimes.com/2017/05/01/opinion/who...


In [129]:
# Keep only the word-count and headline columns. Selecting by label instead of
# by position (2 and 5) keeps this cell correct if the CSV column order changes.
a = df.loc[:, ['articleWordCount', 'headline']]

In [130]:
# Display the two-column selection built in the previous cell.
a

Unnamed: 0,articleWordCount,headline
0,2412,My Beijing: The Sacred City
1,2318,"6 Million Riders a Day, 1930s Technology"
2,1795,Seeking a Cross-Border Conference
3,213,"Questions for: ‘Despite the “Yuck Factor,” Lee..."
4,1342,Who Is a ‘Criminal’?
...,...,...
991,609,"Yes, You Can Write More Than One Letter in a S..."
992,797,Trump’s Act Of Gratuitous Destruction
993,945,"Petty Crime, Daunting Bail"
994,141,Unknown


In [69]:
# Report how many columns the frame has, then list their names.
n_columns = df.shape[1]
print('열의 개수: ', n_columns)
print(df.columns)

열의 개수:  16
Index(['abstract', 'articleID', 'articleWordCount', 'byline', 'documentType',
       'headline', 'keywords', 'multimedia', 'newDesk', 'printPage', 'pubDate',
       'sectionName', 'snippet', 'source', 'typeOfMaterial', 'webURL'],
      dtype='object')


In [94]:
# Pull the headline column out of the frame as a plain Python list.
headline = list(df['headline'].values)
headline[:5]

['My Beijing: The Sacred City',
 '6 Million Riders a Day, 1930s Technology',
 'Seeking a Cross-Border Conference',
 'Questions for: ‘Despite the “Yuck Factor,” Leeches Are Big in Russian Medicine’',
 'Who Is a ‘Criminal’?']

In [96]:
# True if any headline is missing (NaN) in the raw frame.
df['headline'].isna().values.any()

False

In [97]:
# Discard entries whose headline is the literal placeholder 'Unknown'.
headline = list(filter(lambda title: title != 'Unknown', headline))
len(headline)

935

In [11]:
# NOTE(review): this cell's execution count (In [11]) and its saved output (996)
# predate the 'Unknown' filter above, which reports 935 — the output looks stale.
len(headline)

996

In [98]:
# Show the raw headline stored under index label 1.
df['headline'][1]

'6 Million Riders a Day, 1930s Technology'

In [16]:

# Plotting setup. NOTE(review): no plotting call is visible in this part of the
# notebook, and these imports sit apart from the main import cell at the top.
import matplotlib as mpl
import matplotlib.pyplot as plt
# Render minus signs with the ASCII hyphen so they display with fonts that
# lack the Unicode minus glyph.
mpl.rcParams['axes.unicode_minus'] = False
# Presumably chosen so Korean text renders in figures — this font must be
# installed on the machine running the notebook; TODO confirm.
mpl.rc('font', family='Malgun Gothic')
%matplotlib inline

# Default figure size (inches) and base font size for all plots.
plt.rcParams["figure.figsize"] = (10,6)
plt.rcParams["font.size"] = 14

In [19]:
def repreprocessing(s):
    """Normalize a headline for tokenization.

    Drops every non-ASCII character (for example curly quotes), removes all
    characters listed in ``string.punctuation`` and lower-cases what remains.
    Removed characters are deleted, not replaced by spaces, so
    ``'Cross-Border'`` becomes ``'crossborder'``.
    """
    ascii_only = s.encode("utf8").decode("ascii", 'ignore')
    strip_punctuation = str.maketrans('', '', punctuation)
    return ascii_only.translate(strip_punctuation).lower()

In [21]:
# Clean every remaining headline with repreprocessing().
text = list(map(repreprocessing, headline))
text[:5]

['my beijing the sacred city',
 '6 million riders a day 1930s technology',
 'seeking a crossborder conference',
 'questions for despite the yuck factor leeches are big in russian medicine',
 'who is a criminal']

In [22]:
# Fit a word-level tokenizer on the cleaned headlines.
t = Tokenizer()
t.fit_on_texts(text)
# One slot more than the number of distinct words: id 0 is not assigned to a
# word and is the value used for padding further down.
vocab_size = len(t.word_index) + 1
print('단어 집합의 크기 : %d' % vocab_size)

단어 집합의 크기 : 2654


In [24]:
# Expand each encoded headline into all of its prefixes of length >= 2.
# In every prefix the last id is the word to predict from the ids before it.
sequences = [
    encoded[:end]
    for encoded in t.texts_to_sequences(text)
    for end in range(2, len(encoded) + 1)
]

sequences[:11]

[[49, 800],
 [49, 800, 1],
 [49, 800, 1, 801],
 [49, 800, 1, 801, 58],
 [115, 407],
 [115, 407, 408],
 [115, 407, 408, 2],
 [115, 407, 408, 2, 180],
 [115, 407, 408, 2, 180, 802],
 [115, 407, 408, 2, 180, 802, 803],
 [804, 2]]

In [27]:
# Length of the longest training prefix; every sample is padded to this size.
max_len = max(map(len, sequences))
print('샘플의 최대 길이 :', max_len)

샘플의 최대 길이 : 16


In [111]:
# Pad every prefix with leading zeros up to max_len (the default 'pre' padding).
sequences = pad_sequences(sequences, maxlen=max_len)
print(sequences[:3])

[[  0   0   0   0   0   0   0   0   0   0   0   0   0   0  49 800]
 [  0   0   0   0   0   0   0   0   0   0   0   0   0  49 800   1]
 [  0   0   0   0   0   0   0   0   0   0   0   0  49 800   1 801]]


In [30]:
# Inputs are all ids but the last one in each row; the last id is the target.
X, y = sequences[:, :-1], sequences[:, -1]

In [31]:
# One-hot encode the target word ids. Reading them straight from `sequences`
# (instead of re-assigning `y` from itself) keeps this cell safe to re-run:
# encoding an already one-hot `y` a second time would add an extra dimension.
y = to_categorical(sequences[:, -1], num_classes=vocab_size)

In [32]:
# Sanity-check the training array shapes: (samples, max_len - 1) for X and
# (samples, vocab_size) for the one-hot y.
X.shape, y.shape

((5501, 15), (5501, 2654))

In [35]:
# Next-word model: 10-dimensional word embeddings -> 128-unit LSTM -> softmax
# over the whole vocabulary.
model = Sequential([
    Embedding(vocab_size, 10, input_length=max_len - 1,
              name="Embedding_Layer"),
    LSTM(128, name="LSTM_Layer"),
    Dense(vocab_size, activation='softmax', name="Output_Layer"),
])
model.summary()

Model: "sequential_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
Embedding_Layer (Embedding)  (None, 15, 10)            26540     
_________________________________________________________________
LSTM_Layer (LSTM)            (None, 128)               71168     
_________________________________________________________________
Output_Layer (Dense)         (None, 2654)              342366    
Total params: 440,074
Trainable params: 440,074
Non-trainable params: 0
_________________________________________________________________


In [36]:
# Categorical cross-entropy matches the one-hot targets built above.
model.compile(
    optimizer='adam',
    loss='categorical_crossentropy',
    metrics=['accuracy'],
)

In [67]:
# Train for 300 epochs, printing one summary line per epoch (verbose=2).
# NOTE(review): the saved log already shows accuracy ~0.89 in epoch 1, which
# suggests the model had been trained before this cell was last run — a fresh
# "Restart & Run All" will likely produce a different log; confirm.
history = model.fit(X, y, epochs=300, verbose=2)

Epoch 1/300
 - 2s - loss: 0.3791 - accuracy: 0.8911
Epoch 2/300
 - 2s - loss: 0.3963 - accuracy: 0.8875
Epoch 3/300
 - 2s - loss: 0.3614 - accuracy: 0.8957
Epoch 4/300
 - 2s - loss: 0.3500 - accuracy: 0.8953
Epoch 5/300
 - 2s - loss: 0.3467 - accuracy: 0.8957
Epoch 6/300
 - 2s - loss: 0.3441 - accuracy: 0.8957
Epoch 7/300
 - 2s - loss: 0.3431 - accuracy: 0.8973
Epoch 8/300
 - 2s - loss: 0.3420 - accuracy: 0.8946
Epoch 9/300
 - 2s - loss: 0.3411 - accuracy: 0.8940
Epoch 10/300
 - 2s - loss: 0.3397 - accuracy: 0.8942
Epoch 11/300
 - 2s - loss: 0.3394 - accuracy: 0.8960
Epoch 12/300
 - 2s - loss: 0.3363 - accuracy: 0.8977
Epoch 13/300
 - 2s - loss: 0.3375 - accuracy: 0.8955
Epoch 14/300
 - 2s - loss: 0.3366 - accuracy: 0.8964
Epoch 15/300
 - 2s - loss: 0.3362 - accuracy: 0.8977
Epoch 16/300
 - 2s - loss: 0.3347 - accuracy: 0.8962
Epoch 17/300
 - 2s - loss: 0.3345 - accuracy: 0.8967
Epoch 18/300
 - 2s - loss: 0.3331 - accuracy: 0.8964
Epoch 19/300
 - 2s - loss: 0.3324 - accuracy: 0.8967
Ep

Epoch 155/300
 - 2s - loss: 0.3067 - accuracy: 0.8977
Epoch 156/300
 - 2s - loss: 0.3063 - accuracy: 0.8966
Epoch 157/300
 - 2s - loss: 0.3082 - accuracy: 0.8944
Epoch 158/300
 - 2s - loss: 0.3067 - accuracy: 0.8964
Epoch 159/300
 - 2s - loss: 0.3067 - accuracy: 0.8966
Epoch 160/300
 - 2s - loss: 0.3068 - accuracy: 0.8969
Epoch 161/300
 - 2s - loss: 0.3064 - accuracy: 0.8960
Epoch 162/300
 - 2s - loss: 0.3063 - accuracy: 0.8957
Epoch 163/300
 - 2s - loss: 0.3075 - accuracy: 0.8951
Epoch 164/300
 - 2s - loss: 0.3063 - accuracy: 0.8973
Epoch 165/300
 - 2s - loss: 0.3070 - accuracy: 0.8947
Epoch 166/300
 - 2s - loss: 0.3070 - accuracy: 0.8957
Epoch 167/300
 - 2s - loss: 0.3078 - accuracy: 0.8947
Epoch 168/300
 - 2s - loss: 0.3067 - accuracy: 0.8946
Epoch 169/300
 - 2s - loss: 0.3072 - accuracy: 0.8940
Epoch 170/300
 - 2s - loss: 0.3073 - accuracy: 0.8975
Epoch 171/300
 - 2s - loss: 0.3075 - accuracy: 0.8946
Epoch 172/300
 - 2s - loss: 0.3062 - accuracy: 0.8984
Epoch 173/300
 - 2s - loss: 

In [108]:
def sentence_generation(model, t, current_word, n):
    """Greedily extend a seed phrase with up to ``n`` predicted words.

    Args:
        model: Trained next-word model; ``model.predict`` must return one row
            of per-word scores for a padded id sequence.
        t: Fitted tokenizer providing ``texts_to_sequences`` and ``word_index``.
        current_word: Seed text. It is returned unchanged at the start of the
            result.
        n: Maximum number of words to append.

    Returns:
        The seed text followed by the generated words, separated by single
        spaces. Fewer than ``n`` words are appended if the model predicts an
        id that has no vocabulary entry (for example the padding id 0).

    Note:
        Relies on the notebook-level ``max_len`` and ``pad_sequences`` names.
    """
    # Invert the vocabulary once instead of scanning it on every step.
    index_to_word = {index: word for word, index in t.word_index.items()}
    input_len = max_len - 1  # the model was trained on max_len - 1 input ids

    generated = []
    context = current_word
    for _ in range(n):
        encoded = t.texts_to_sequences([context])[0]
        encoded = pad_sequences([encoded], maxlen=input_len)
        # Sequential.predict_classes() no longer exists in newer Keras
        # releases; argmax over the softmax output gives the same class id.
        scores = model.predict(encoded, verbose=0)
        predicted_index = int(scores.argmax(axis=-1)[0])

        word = index_to_word.get(predicted_index)
        if word is None:
            # No word maps to this id. The original loop fell through here and
            # silently appended the last vocabulary word; stop instead.
            break

        generated.append(word)
        context = context + ' ' + word

    return ' '.join([current_word] + generated)

In [109]:
# Reference headline to compare against the text generated in the next cell.
df['headline'][1]

'6 Million Riders a Day, 1930s Technology'

In [113]:
# Seed with the first two words of headline 1 and generate five more words.
print(sentence_generation(model, t, '6 Million', 5))

6 Million riders a day 1930s technology


In [117]:
# Second reference headline to compare against the text generated next.
df['headline'][50]

'How Marching for Science Risks Politicizing It'

In [119]:
# Seed with the first two words of headline 50 and generate five more words.
print(sentence_generation(model, t, 'How Marching', 5))

How Marching for science risks politicizing it


In [115]:
# Show the full headline column; pandas abbreviates the middle rows.
df['headline']

0                            My Beijing: The Sacred City
1               6 Million Riders a Day, 1930s Technology
2                      Seeking a Cross-Border Conference
3      Questions for: ‘Despite the “Yuck Factor,” Lee...
4                                   Who Is a ‘Criminal’?
                             ...                        
991    Yes, You Can Write More Than One Letter in a S...
992               Trump’s Act  Of Gratuitous Destruction
993                           Petty Crime, Daunting Bail
994                                              Unknown
995    Bucking Trump, These Cities, States and Compan...
Name: headline, Length: 996, dtype: object