In [1]:
import numpy as np
import pandas as pd
import tensorflow as ts

In [27]:
from tensorflow import keras
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.utils import to_categorical
from keras.models import Sequential
from keras.layers import Embedding, SimpleRNN, Dense, LSTM

from string import punctuation

In [3]:
df = pd.read_csv('dataset/newyork/ArticlesApril2018.csv', encoding='latin')

In [4]:
df.head()

Unnamed: 0,articleID,articleWordCount,byline,documentType,headline,keywords,multimedia,newDesk,printPage,pubDate,sectionName,snippet,source,typeOfMaterial,webURL
0,5adf6684068401528a2aa69b,781,By JOHN BRANCH,article,Former N.F.L. Cheerleadersâ Settlement Offer...,"['Workplace Hazards and Violations', 'Football...",68,Sports,0,2018-04-24 17:16:49,Pro Football,"âI understand that they could meet with us, ...",The New York Times,News,https://www.nytimes.com/2018/04/24/sports/foot...
1,5adf653f068401528a2aa697,656,By LISA FRIEDMAN,article,E.P.A. to Unveil a New Rule. Its Effect: Less ...,"['Environmental Protection Agency', 'Pruitt, S...",68,Climate,0,2018-04-24 17:11:21,Unknown,The agency plans to publish a new regulation T...,The New York Times,News,https://www.nytimes.com/2018/04/24/climate/epa...
2,5adf4626068401528a2aa628,2427,By PETE WELLS,article,"The New Noma, Explained","['Restaurants', 'Noma (Copenhagen, Restaurant)...",66,Dining,0,2018-04-24 14:58:44,Unknown,Whatâs it like to eat at the second incarnat...,The New York Times,News,https://www.nytimes.com/2018/04/24/dining/noma...
3,5adf40d2068401528a2aa619,626,By JULIE HIRSCHFELD DAVIS and PETER BAKER,article,Unknown,"['Macron, Emmanuel (1977- )', 'Trump, Donald J...",68,Washington,0,2018-04-24 14:35:57,Europe,President Trump welcomed President Emmanuel Ma...,The New York Times,News,https://www.nytimes.com/2018/04/24/world/europ...
4,5adf3d64068401528a2aa60f,815,By IAN AUSTEN and DAN BILEFSKY,article,Unknown,"['Toronto, Ontario, Attack (April, 2018)', 'Mu...",68,Foreign,0,2018-04-24 14:21:21,Canada,"Alek Minassian, 25, a resident of Torontoâs ...",The New York Times,News,https://www.nytimes.com/2018/04/24/world/canad...


In [5]:
df.count()

articleID           1324
articleWordCount    1324
byline              1324
documentType        1324
headline            1324
keywords            1324
multimedia          1324
newDesk             1324
printPage           1324
pubDate             1324
sectionName         1324
snippet             1324
source              1324
typeOfMaterial      1324
webURL              1324
dtype: int64

In [6]:
df.columns

Index(['articleID', 'articleWordCount', 'byline', 'documentType', 'headline',
       'keywords', 'multimedia', 'newDesk', 'printPage', 'pubDate',
       'sectionName', 'snippet', 'source', 'typeOfMaterial', 'webURL'],
      dtype='object')

In [7]:
df['headline'].isnull().values.any()

False

In [8]:
headline = [title for title in df.headline.values]
headline[:5]

['Former N.F.L. Cheerleadersâ\x80\x99 Settlement Offer: $1 and a Meeting With Goodell',
 'E.P.A. to Unveil a New Rule. Its Effect: Less Science in Policymaking.',
 'The New Noma, Explained',
 'Unknown',
 'Unknown']

In [9]:
headline = [title for title in headline if title != 'Unknown']

In [10]:
headline

['Former N.F.L. Cheerleadersâ\x80\x99 Settlement Offer: $1 and a Meeting With Goodell',
 'E.P.A. to Unveil a New Rule. Its Effect: Less Science in Policymaking.',
 'The New Noma, Explained',
 'How a Bag of Texas Dirt  Became a Times Tradition',
 'Is School a Place for Self-Expression?',
 'Commuter Reprogramming',
 'Ford Changed Leaders, Looking for a Lift. Itâ\x80\x99s Still Looking.',
 'Romney Failed to Win at Utah Convention, But Few Believe Heâ\x80\x99s Doomed',
 'Chain Reaction',
 'He Forced the Vatican to Investigate Sex Abuse. Now Heâ\x80\x99s Meeting With Pope Francis.',
 'In Berlin, artists find a home',
 'The Right Stuff',
 'Jimmy Carter Knows What North Korea Wants',
 'The Truth Is Out There',
 'New Jersey Ruling Could Reignite Battle Over Church-State Separation',
 'Procrastinating',
 'Word + Quiz: dilatory',
 'My Life-Threatening Bout With E. Coli Food Poisoning',
 'Choosing Brexit, a Town Yearned for Its Seafaring Past, and Muddied Its Future',
 'A Quote Disproved',
 'Hot 

In [11]:
def repreprocessing(s):
    s = s.encode('utf8').decode('ascii', 'ignore')
    return ''.join(c for c in s if c not in punctuation).lower()

In [12]:
text = [repreprocessing(x) for x in headline]

In [13]:
text[:5]

['former nfl cheerleaders settlement offer 1 and a meeting with goodell',
 'epa to unveil a new rule its effect less science in policymaking',
 'the new noma explained',
 'how a bag of texas dirt  became a times tradition',
 'is school a place for selfexpression']

In [14]:
t = Tokenizer()

In [15]:
t.fit_on_texts(text)
vocab_size = len(t.word_index) + 1
vocab_size

3494

In [17]:
seqences = []
for line in text:
    encoded = t.texts_to_sequences([line])[0]
    for i in range(1, len(encoded)):
        seqence = encoded[:i+1]
        seqences.append(seqence)
        
seqences[:11]

[[99, 269],
 [99, 269, 371],
 [99, 269, 371, 1115],
 [99, 269, 371, 1115, 582],
 [99, 269, 371, 1115, 582, 52],
 [99, 269, 371, 1115, 582, 52, 7],
 [99, 269, 371, 1115, 582, 52, 7, 2],
 [99, 269, 371, 1115, 582, 52, 7, 2, 372],
 [99, 269, 371, 1115, 582, 52, 7, 2, 372, 10],
 [99, 269, 371, 1115, 582, 52, 7, 2, 372, 10, 1116],
 [100, 3]]

In [18]:
index_to_word = {}
for key, value in t.word_index.items():
    index_to_word[value] = key
    
index_to_word[1], index_to_word[582]

('the', 'offer')

In [19]:
max_len = max(len(s) for s in seqences)
max_len

24

In [21]:
seqences = pad_sequences(seqences, max_len)

In [22]:
seqences[:5]

array([[   0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          99,  269],
       [   0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,   99,
         269,  371],
       [   0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,   99,  269,
         371, 1115],
       [   0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,   99,  269,  371,
        1115,  582],
       [   0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,   99,  269,  371, 1115,
         582,   52]])

In [24]:
X = seqences[:, :-1]
Y = seqences[:, -1]
y = to_categorical(Y, vocab_size)

In [26]:
X.shape, y.shape

((7803, 23), (7803, 3494))

In [43]:
def call_model(embed_units , LSTM_units):
    model = Sequential([
        Embedding(vocab_size, embed_units, input_length=max_len-1, name='Embedding_Layer'),
        LSTM(LSTM_units, name='LSTM_Layer'),
        Dense(vocab_size, activation='softmax', name='Ouput_Layer')
    ])
    
    return model

In [44]:
model = call_model(10, 128)

In [45]:
model.summary()

Model: "sequential_2"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
Embedding_Layer (Embedding)  (None, 23, 10)            34940     
_________________________________________________________________
LSTM_Layer (LSTM)            (None, 128)               71168     
_________________________________________________________________
Ouput_Layer (Dense)          (None, 3494)              450726    
Total params: 556,834
Trainable params: 556,834
Non-trainable params: 0
_________________________________________________________________


In [46]:
model.compile(loss = 'categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

In [47]:

history = model.fit(X, y, epochs=200, verbose=2)

  "Converting sparse IndexedSlices to a dense Tensor of unknown shape. "


Epoch 1/200
 - 6s - loss: 7.6507 - accuracy: 0.0264
Epoch 2/200
 - 5s - loss: 7.1109 - accuracy: 0.0291
Epoch 3/200
 - 5s - loss: 6.9747 - accuracy: 0.0322
Epoch 4/200
 - 5s - loss: 6.8460 - accuracy: 0.0399
Epoch 5/200
 - 5s - loss: 6.6947 - accuracy: 0.0432
Epoch 6/200
 - 5s - loss: 6.5209 - accuracy: 0.0470
Epoch 7/200
 - 5s - loss: 6.3330 - accuracy: 0.0522
Epoch 8/200
 - 5s - loss: 6.1308 - accuracy: 0.0587
Epoch 9/200
 - 5s - loss: 5.9324 - accuracy: 0.0638
Epoch 10/200
 - 5s - loss: 5.7468 - accuracy: 0.0650
Epoch 11/200
 - 5s - loss: 5.5718 - accuracy: 0.0725
Epoch 12/200
 - 5s - loss: 5.4060 - accuracy: 0.0777
Epoch 13/200
 - 5s - loss: 5.2483 - accuracy: 0.0852
Epoch 14/200
 - 5s - loss: 5.1009 - accuracy: 0.0889
Epoch 15/200
 - 5s - loss: 4.9560 - accuracy: 0.0957
Epoch 16/200
 - 5s - loss: 4.8193 - accuracy: 0.1119
Epoch 17/200
 - 5s - loss: 4.6848 - accuracy: 0.1221
Epoch 18/200
 - 5s - loss: 4.5539 - accuracy: 0.1357
Epoch 19/200
 - 5s - loss: 4.4302 - accuracy: 0.1511
Ep

In [56]:
def sentence_generation(model, t, current_word, n):
    word_init = current_word
    sentence = ''
    for _ in range(n):
        encoded = t.texts_to_sequences([current_word])[0]
        encoded = pad_sequences([encoded], 23)
        
        result = model.predict_classes(encoded, verbose = 0)
        for word, index in t.word_index.items():
            if index == result:
                break
        current_word = current_word + ' ' + word
        
        sentence = sentence + ' ' + word
    sentence = word_init + sentence
    
    return sentence
                    

In [57]:
print(sentence_generation(model, t, 'i', 10))

i disapprove of school vouchers can i still apply for them


In [58]:
print(sentence_generation(model, t, 'how', 10))

how to make a crossword puzzle of franchise internal dissent of


In [59]:
print(sentence_generation(model, t, 'former', 10))

former nfl cheerleaders settlement offer 1 and a meeting with goodell
