### Importing the libraries

In [19]:
# keras module for building LSTM 
from keras.preprocessing.sequence import pad_sequences
from keras.layers import Embedding, LSTM, Dense
from keras.preprocessing.text import Tokenizer
from keras.callbacks import EarlyStopping
from keras.models import Sequential
import keras.utils as ku 

# set seeds for reproducibility
import tensorflow as tf

from numpy.random import seed
tf.random.set_seed(2)
seed(1)

import pandas as pd
import numpy as np
import string, os 

import warnings
warnings.filterwarnings("ignore")
warnings.simplefilter(action='ignore', category=FutureWarning)

### Load the dataset

In [20]:
# Load the dataset of news headlines
curr_dir = './'
all_headlines = []
# os.listdir returns entries in arbitrary order; sorting makes the single file
# picked below the same on every run, in line with the seeds set above.
for filename in sorted(os.listdir(curr_dir)):
    if 'Articles' in filename:
        article_df = pd.read_csv(os.path.join(curr_dir, filename))
        # Drop missing headlines (read as NaN floats) so that the text-cleaning
        # step only ever receives strings.
        all_headlines.extend(article_df.headline.dropna().tolist())
        # Only the first matching file is loaded (presumably to keep the corpus
        # small); remove this break to load every "Articles" file.
        break

all_headlines[:10]

['N.F.L. vs. Politics Has Been Battle All Season Long',
 'Voice. Vice. Veracity.',
 'A Stand-Up’s Downward Slide',
 'New York Today: A Groundhog Has Her Day',
 'A Swimmer’s Communion With the Ocean',
 'Trail Activity',
 'Super Bowl',
 'Trump’s Mexican Shakedown',
 'Pence’s Presidential Pet',
 'Fruit of a Poison Tree']

### Dataset cleaning

In [21]:
# In the dataset preparation step, we first clean the text: this removes punctuation and lower-cases all the words.

def clean_text(txt):
    """Normalise one headline for tokenization.

    Characters listed in ``string.punctuation`` are removed, the remainder is
    lower-cased, and finally every non-ASCII character (e.g. a curly
    apostrophe) is dropped by an encode/decode round trip.

    Args:
        txt: The raw headline text.

    Returns:
        The cleaned, lower-case, ASCII-only string.
    """
    without_punctuation = filter(lambda ch: ch not in string.punctuation, txt)
    lowered = "".join(without_punctuation).lower()
    # Encoding to UTF-8 and decoding as ASCII with 'ignore' silently discards
    # any byte that is not plain ASCII.
    ascii_only = lowered.encode("utf8").decode("ascii", 'ignore')
    return ascii_only

# Clean every loaded headline, then preview the first ten results.
corpus = list(map(clean_text, all_headlines))
corpus[:10]

['nfl vs politics has been battle all season long',
 'voice vice veracity',
 'a standups downward slide',
 'new york today a groundhog has her day',
 'a swimmers communion with the ocean',
 'trail activity',
 'super bowl',
 'trumps mexican shakedown',
 'pences presidential pet',
 'fruit of a poison tree']

### Generating Sequence of N-gram Tokens

In [22]:
# Shared tokenizer instance: fitted inside get_sequence_of_tokens and read again
# by generate_text to encode seed text and decode predicted indices.
tokenizer = Tokenizer()

def get_sequence_of_tokens(corpus):
    """Fit the shared tokenizer and expand each line into n-gram prefixes.

    For a line encoded as ``[t1, t2, ..., tn]`` this yields ``[t1, t2]``,
    ``[t1, t2, t3]``, ... up to the full line, so lines with fewer than two
    tokens contribute nothing.

    Args:
        corpus: Iterable of cleaned text lines.

    Returns:
        A tuple ``(input_sequences, total_words)`` where ``input_sequences`` is
        the list of token-id prefixes and ``total_words`` is the size of the
        tokenizer's word index plus one.
    """
    # Learn the vocabulary from the whole corpus (updates the module-level tokenizer).
    tokenizer.fit_on_texts(corpus)
    vocab_size = len(tokenizer.word_index) + 1

    # Every prefix of length >= 2 becomes one training sequence.
    ngram_sequences = []
    for text_line in corpus:
        encoded = tokenizer.texts_to_sequences([text_line])[0]
        ngram_sequences.extend(
            encoded[:end] for end in range(2, len(encoded) + 1)
        )
    return ngram_sequences, vocab_size

# Build the n-gram training sequences and the vocabulary size from the cleaned
# headlines, then preview the first ten sequences.
inp_sequences, total_words = get_sequence_of_tokens(corpus)
inp_sequences[:10]

[[661, 118],
 [661, 118, 73],
 [661, 118, 73, 74],
 [661, 118, 73, 74, 662],
 [661, 118, 73, 74, 662, 663],
 [661, 118, 73, 74, 662, 663, 64],
 [661, 118, 73, 74, 662, 663, 64, 30],
 [661, 118, 73, 74, 662, 663, 64, 30, 211],
 [212, 664],
 [212, 664, 665]]

### Padding the Sequences and Obtaining the Variables: Predictors and Target

In [23]:
def generate_padded_sequences(input_sequences, num_classes=None):
    """Left-pad the n-gram sequences and split them into inputs and targets.

    Every sequence is padded at the front to the length of the longest one.
    The last token of each padded row becomes the target and the preceding
    tokens become the predictors.

    Args:
        input_sequences: List of token-id sequences.
        num_classes: Width of the one-hot target vectors. When omitted, the
            notebook-level ``total_words`` value is used, which keeps existing
            one-argument calls working unchanged.

    Returns:
        A tuple ``(predictors, label, max_sequence_len)``: the 2-D array of
        input tokens, the one-hot encoded targets, and the padded length.

    Raises:
        ValueError: If ``input_sequences`` is empty.
    """
    lengths = [len(x) for x in input_sequences]
    if not lengths:
        raise ValueError("input_sequences is empty; there is nothing to pad")
    if num_classes is None:
        # Backward-compatible default: vocabulary size computed earlier in the notebook.
        num_classes = total_words

    max_sequence_len = max(lengths)
    input_sequences = np.asarray(pad_sequences(input_sequences, maxlen=max_sequence_len, padding='pre'))

    # All columns but the last are the model input; the last column is the word to predict.
    predictors, label = input_sequences[:, :-1], input_sequences[:, -1]
    label = ku.to_categorical(label, num_classes=num_classes)
    return predictors, label, max_sequence_len

# Pad the n-gram sequences to a common length and split each one into its input
# tokens (all but the last) and its one-hot target (the last token).
predictors, label, max_sequence_len = generate_padded_sequences(inp_sequences)


### LSTMs for Text Generation

In [24]:
def create_model(max_sequence_len, total_words, embedding_dim=10, lstm_units=100):
    """Build and compile the next-word prediction network.

    The model is Embedding -> LSTM -> Dense(softmax) over the vocabulary,
    compiled with categorical cross-entropy loss and the Adam optimizer.

    Args:
        max_sequence_len: Length of the padded n-gram sequences, including the
            final target token.
        total_words: Vocabulary size; used as the embedding input dimension and
            the number of output classes.
        embedding_dim: Size of each word embedding vector (default 10).
        lstm_units: Number of units in the LSTM layer (default 100).

    Returns:
        The compiled ``Sequential`` model.
    """
    # The last token of every padded sequence is the label, so the network
    # input is one token shorter than the padded length.
    input_len = max_sequence_len - 1
    model = Sequential()

    # Add Input Embedding Layer
    model.add(Embedding(total_words, embedding_dim, input_length=input_len))

    # Add Hidden Layer 1 - LSTM Layer
    model.add(LSTM(lstm_units))

    # Add Output Layer: one probability per vocabulary entry
    model.add(Dense(total_words, activation='softmax'))

    model.compile(loss='categorical_crossentropy', optimizer='adam')

    return model

# Instantiate the network for this corpus (padded length and vocabulary size
# computed above) and print its layer-by-layer summary.
model = create_model(max_sequence_len, total_words)
model.summary()


Model: "sequential_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1 (Embedding)      (None, 16, 10)            22890     
_________________________________________________________________
lstm_1 (LSTM)                (None, 100)               44400     
_________________________________________________________________
dense_1 (Dense)              (None, 2289)              231189    
Total params: 298,479
Trainable params: 298,479
Non-trainable params: 0
_________________________________________________________________


### Let's train our model now

In [25]:
# Train for 100 epochs on the padded predictors and their one-hot labels;
# verbose=2 logs one line per epoch.
model.fit(predictors, label, epochs=100, verbose=2)

Epoch 1/100
142/142 - 1s - loss: 7.3648
Epoch 2/100
142/142 - 1s - loss: 6.8443
Epoch 3/100
142/142 - 1s - loss: 6.7230
Epoch 4/100
142/142 - 1s - loss: 6.6393
Epoch 5/100
142/142 - 1s - loss: 6.5619
Epoch 6/100
142/142 - 1s - loss: 6.4707
Epoch 7/100
142/142 - 1s - loss: 6.3624
Epoch 8/100
142/142 - 1s - loss: 6.2496
Epoch 9/100
142/142 - 1s - loss: 6.1420
Epoch 10/100
142/142 - 1s - loss: 6.0428
Epoch 11/100
142/142 - 1s - loss: 5.9480
Epoch 12/100
142/142 - 1s - loss: 5.8557
Epoch 13/100
142/142 - 1s - loss: 5.7696
Epoch 14/100
142/142 - 1s - loss: 5.6833
Epoch 15/100
142/142 - 1s - loss: 5.5977
Epoch 16/100
142/142 - 1s - loss: 5.5147
Epoch 17/100
142/142 - 1s - loss: 5.4273
Epoch 18/100
142/142 - 1s - loss: 5.3422
Epoch 19/100
142/142 - 1s - loss: 5.2579
Epoch 20/100
142/142 - 1s - loss: 5.1742
Epoch 21/100
142/142 - 1s - loss: 5.0928
Epoch 22/100
142/142 - 1s - loss: 5.0076
Epoch 23/100
142/142 - 1s - loss: 4.9259
Epoch 24/100
142/142 - 1s - loss: 4.8447
Epoch 25/100
142/142 - 1s

<tensorflow.python.keras.callbacks.History at 0x1e16fb50bc8>

### Generating the text

In [26]:
def generate_text(seed_text, next_words, model, max_sequence_len):
    """Extend ``seed_text`` with ``next_words`` greedily predicted words.

    On every step the running text is encoded with the shared tokenizer,
    left-padded to ``max_sequence_len - 1`` tokens, and fed to the model; the
    word with the highest predicted probability is appended.

    Args:
        seed_text: Starting text to continue.
        next_words: Number of words to generate.
        model: Trained model exposing ``predict``.
        max_sequence_len: Padded sequence length used during training
            (including the target token).

    Returns:
        The seed text followed by the generated words, title-cased.
    """
    # Invert the vocabulary once instead of scanning word_index for every
    # generated word.
    index_to_word = {index: word for word, index in tokenizer.word_index.items()}

    for _ in range(next_words):
        token_list = tokenizer.texts_to_sequences([seed_text])[0]
        token_list = pad_sequences([token_list], maxlen=max_sequence_len-1, padding='pre')
        # Sequential.predict_classes was removed in TensorFlow 2.6; taking the
        # argmax of predict() gives the same class index on every version.
        probabilities = model.predict(token_list, verbose=0)
        predicted = int(np.argmax(probabilities, axis=-1)[0])

        # An index without a vocabulary entry (e.g. the padding index 0) adds
        # an empty word, matching the previous behaviour.
        output_word = index_to_word.get(predicted, "")
        seed_text += " " + output_word
    return seed_text.title()

### Results

In [27]:
# Generate one continuation per (seed text, number of words) pair.
for seed_phrase, n_words in (
    ("united states", 5),
    ("president trump", 4),
    ("donald trump", 4),
):
    print(generate_text(seed_phrase, n_words, model, max_sequence_len))

United States More Know Obamacare And Affordable
President Trump Save A Tail Snobs
Donald Trump Middleschool President Turns A
