In [10]:
import tensorflow as tf
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.layers import Embedding, LSTM, Dense, Bidirectional
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.models import Sequential
from tensorflow.keras.optimizers import Adam
import numpy as np 


import csv as csv

In [11]:
filename = "historic_articles.csv"
fields = [] 
rows = [] 
with open(filename, 'r') as csvfile: 
    # creating a csv reader object 
    csvreader = csv.reader(csvfile) 
      
    # extracting field names through first row 
    fields = next(csvreader) 
      
    # extracting each data row one by one 
    for row in csvreader: 
        rows.append(row) 
  
    # get total number of rows 
    print("Total no. of rows: %d"%(csvreader.line_num)) 
    
#rows
print(rows[0:5])

#fields
print(fields)

Total no. of rows: 1570
[['firstpost', 'india', 'https://www.firstpost.com/india/supreme-court-in-anti-caa-blockade-case-says-there-cant-be-universal-policy-on-right-to-protest-8837421.html', 'FP Staff', 'September 21, 2020 20:11:56 IST', "Supreme Court, in anti-CAA blockade case, says there can't be universal policy on right to protest - India News , Firstpost", 'While listening to a batch of petitions against the anti-CAA protests in December, the top court said that the right to protest has to be balanced against acts like blocking of roads and the situation may vary from case to case.', 'The Supreme Court on Monday said there cannot be a "universal policy" on right to protest and possible curbs as the situation may vary from case to case.The apex court made the observation while reserving its verdict\xa0on a batch of pleas against the anti-Citizenship Amendment Act (CAA) protests which led to blocking of a road in Shaheen Bagh in the National Capital last December.A bench consistin

In [12]:
type_to_article = {}
for row in rows:
    if row[1] not in type_to_article:
        type_to_article[row[1]] = row[5] + " \n " + row[6] + " \n " + row[7]        
    else:
        type_to_article[row[1]] += row[5] + " \n " + row[6] + " \n " + row[7]

In [13]:
tokenizer = Tokenizer() #instantiating the tokenizer
data = type_to_article['india']
corpus = data.lower().split("\n") #converting the sentence to lowercase
tokenizer.fit_on_texts(corpus) #creates tokens for each words 
total_words = len(tokenizer.word_index) + 1 #calculating total number of words in the initial sentence
print(total_words)

17200


In [14]:
input_sequences = [] #training features (x) will be a list

for line in corpus:
	token_list = tokenizer.texts_to_sequences([line])[0] #converts each sentence as its tokenized equivalent
	for i in range(1, len(token_list)):
		n_gram_sequence = token_list[:i+1] #generating n gram sequences
		input_sequences.append(n_gram_sequence) #appending each n gram sequence to the list of our features (xs)
print(input_sequences[:5])

[[328, 48], [328, 48, 5], [328, 48, 5, 387], [328, 48, 5, 387, 619], [328, 48, 5, 387, 619, 4254]]


In [16]:
max_sequence_len = max([len(x) for x in input_sequences]) #calculating the length of the longest sequence
input_sequences = np.array(pad_sequences(input_sequences, maxlen=max_sequence_len, padding='pre')) #pre-pading each value of the input_sequence
xs, labels = input_sequences[:,:-1],input_sequences[:,-1] #creating xs and their labels using numpy slicing
ys = tf.keras.utils.to_categorical(labels, num_classes=total_words) #creating one hot encoding values

In [17]:
model = Sequential() #creating a sequential model
model.add(Embedding(total_words, 64, input_length=max_sequence_len-1)) #adding an embedding layer with 64 as the embedding dimension
model.add(Bidirectional(LSTM(20))) #adding 20 LSTM units
model.add(Dense(total_words, activation='softmax')) #creating a dense layer with 54 output units (total_words) with softmax activation

In [18]:
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy']) #compiling the model with adam optimiser
history = model.fit(xs, ys, epochs=500, verbose=1) #training for 500 epochs

Epoch 1/500
Epoch 2/500

KeyboardInterrupt: 

In [None]:
#predicting the next word using an initial sentence
input_phrase = "hello"
next_words = 5
  
for _ in range(next_words):
	token_list = tokenizer.texts_to_sequences([input_phrase])[0] #converting our input_phrase to tokens and excluding the out of vcabulary words
	token_list = pad_sequences([token_list], maxlen=max_sequence_len-1, padding='pre') #padding the input_phrase
	predicted = model.predict_classes(token_list, verbose=0) #predicting the token of the next word using our trained model
	output_word = "" #initialising output word as blank at the beginning
	for word, index in tokenizer.word_index.items():
		if index == predicted:
			output_word = word #converting the token back to the corresponding word and storing it in the output_word
			break
	input_phrase += " " + output_word
print(input_phrase)