In [1]:
import numpy as np
import pandas as pd
from nltk.tokenize import word_tokenize
from sklearn.model_selection import train_test_split
from nltk.corpus import stopwords
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense,Embedding,LSTM
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

In [2]:
df=pd.read_csv("ArticlesApril2017.csv")
text=df['headline'].tolist()
corpus=text[:10]
print(corpus)

['Finding an Expansive View  of a Forgotten People in Niger', 'And Now,  the Dreaded Trump Curse', 'Venezuela’s Descent Into Dictatorship', 'Stain Permeates Basketball Blue Blood', 'Taking Things for Granted', 'The Caged Beast Awakens', 'An Ever-Unfolding Story', 'O’Reilly Thrives as Settlements Add Up', 'Mouse Infestation', 'Divide in G.O.P. Now Threatens Trump Tax Plan']


In [47]:
tokenizer = Tokenizer()

def get_sequence_of_tokens(corpus):
    ## tokenization
    tokenizer.fit_on_texts(corpus)
    total_words = len(tokenizer.word_index) + 1
 
    input_sequences = []
    for line in corpus:
        token_list = tokenizer.texts_to_sequences([line])[0]
        for i in range(1, len(token_list)):
            n_gram_sequence = token_list[:i+1]
            input_sequences.append(n_gram_sequence)
    return input_sequences, total_words

inp_sequences, total_words = get_sequence_of_tokens(corpus)
inp_sequences[:10]

[[6, 1],
 [6, 1, 7],
 [6, 1, 7, 8],
 [6, 1, 7, 8, 9],
 [6, 1, 7, 8, 9, 10],
 [6, 1, 7, 8, 9, 10, 11],
 [6, 1, 7, 8, 9, 10, 11, 12],
 [6, 1, 7, 8, 9, 10, 11, 12, 2],
 [6, 1, 7, 8, 9, 10, 11, 12, 2, 13],
 [14, 3]]

In [48]:
def pader(inputs):
    length=max([len(i) for i in inputs])
    inp=pad_sequences(inputs,maxlen=length,padding="pre")
    X,y=inp[:,:-1],inp[:,-1]
    y=to_categorical(y,num_classes=total_words)
    return inp,length,y,X
inp,length,y,X=pader(inp_sequences)

In [49]:
model=Sequential([
    Embedding(total,10,input_length=length),
    LSTM(50),
    Dense(total,activation="softmax")
])
model.compile(loss='categorical_crossentropy', optimizer='adam')

In [50]:
model.fit(X, y, epochs=5, verbose=1)
print(y)

Epoch 1/5
[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 91ms/step - loss: 3.9317
Epoch 2/5
[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 51ms/step - loss: 3.9298 
Epoch 3/5
[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 45ms/step - loss: 3.9276
Epoch 4/5
[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 45ms/step - loss: 3.9254
Epoch 5/5
[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 56ms/step - loss: 3.9235
[[0. 1. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 ...
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 1. 0.]
 [0. 0. 0. ... 0. 0. 1.]]


In [54]:
model.fit(X,y, epochs=5, verbose=1, batch_size=16)

Epoch 1/5
[1m3/3[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 31ms/step - loss: 3.8066
Epoch 2/5
[1m3/3[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 34ms/step - loss: 3.8074
Epoch 3/5
[1m3/3[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 31ms/step - loss: 3.7542
Epoch 4/5
[1m3/3[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 26ms/step - loss: 3.7989
Epoch 5/5
[1m3/3[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 24ms/step - loss: 3.7473


<keras.src.callbacks.history.History at 0x1e3469a64e0>

In [55]:
def output(seed,length):
    for i in range(length):
        tok=tokenizer.texts_to_sequences([seed])[0]
        tok=pad_sequences([tok],maxlen=4,padding="pre")
        pred=model.predict(tok)[0]
        index=np.random.choice(range(total),p=pred)
        data=tokenizer.index_word.get(index,"")
        seed+=" "+data
    return seed
print(output("Good Watch",4))

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 444ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 83ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 86ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 79ms/step
Good Watch o dictatorship tax finding
