In [1]:
import numpy as np
import pandas as pd
import string

In [2]:
# Location of the April 2017 articles CSV on the author's machine.
articles_csv_path = r"V:\Project & case studies\Text_Data_6\ArticlesApril2017.csv"
article_df = pd.read_csv(articles_csv_path)

In [3]:
# Preview the first rows of the loaded frame to check its columns.
article_df.head()

Unnamed: 0,abstract,articleID,articleWordCount,byline,documentType,headline,keywords,multimedia,newDesk,printPage,pubDate,sectionName,snippet,source,typeOfMaterial,webURL
0,,58def1347c459f24986d7c80,716,By STEPHEN HILTNER and SUSAN LEHMAN,article,Finding an Expansive View of a Forgotten Peop...,"['Photography', 'New York Times', 'Niger', 'Fe...",3,Insider,2,2017-04-01 00:15:41,Unknown,One of the largest photo displays in Times his...,The New York Times,News,https://www.nytimes.com/2017/03/31/insider/nig...
1,,58def3237c459f24986d7c84,823,By GAIL COLLINS,article,"And Now, the Dreaded Trump Curse","['United States Politics and Government', 'Tru...",3,OpEd,23,2017-04-01 00:23:58,Unknown,Meet the gang from under the bus.,The New York Times,Op-Ed,https://www.nytimes.com/2017/03/31/opinion/and...
2,,58def9f57c459f24986d7c90,575,By THE EDITORIAL BOARD,article,Venezuela’s Descent Into Dictatorship,"['Venezuela', 'Politics and Government', 'Madu...",3,Editorial,22,2017-04-01 00:53:06,Unknown,A court ruling annulling the legislature’s aut...,The New York Times,Editorial,https://www.nytimes.com/2017/03/31/opinion/ven...
3,,58defd317c459f24986d7c95,1374,By MICHAEL POWELL,article,Stain Permeates Basketball Blue Blood,"['Basketball (College)', 'University of North ...",3,Sports,1,2017-04-01 01:06:52,College Basketball,"For two decades, until 2013, North Carolina en...",The New York Times,News,https://www.nytimes.com/2017/03/31/sports/ncaa...
4,,58df09b77c459f24986d7ca7,708,By DEB AMLEN,article,Taking Things for Granted,['Crossword Puzzles'],3,Games,0,2017-04-01 02:00:14,Unknown,In which Howard Barkin and Will Shortz teach u...,The New York Times,News,https://www.nytimes.com/2017/03/31/crosswords/...


In [4]:
# Copy the `headline` column into a plain Python list.
all_headlines = list(article_df.headline.values)

In [5]:
# Peek at the first three headlines.
all_headlines[:3]

['Finding an Expansive View  of a Forgotten People in Niger',
 'And Now,  the Dreaded Trump Curse',
 'Venezuela’s Descent Into Dictatorship']

In [6]:
# Number of headlines collected from the frame.
len(all_headlines)

886

In [7]:
# Discard entries whose headline is exactly the string "Unknown".
all_headlines = list(filter(lambda headline: headline != "Unknown", all_headlines))

In [8]:
# Current number of headlines in the list.
len(all_headlines)

831

In [9]:
def clean_text(txt):
  """Normalize a headline for tokenization.

  Characters found in ``string.punctuation`` are removed, the remainder is
  lower-cased, and any character that cannot be represented in ASCII is
  dropped.

  Args:
    txt: Text to normalize.

  Returns:
    The cleaned, lower-case, ASCII-only string.
  """
  without_punctuation = "".join(
    filter(lambda ch: ch not in string.punctuation, txt)
  )
  lowered = without_punctuation.lower()
  # Round-trip through bytes so non-ASCII characters are silently discarded.
  ascii_only = lowered.encode('utf8').decode('ascii','ignore')
  return ascii_only

In [10]:
# Run every headline through clean_text to build the training corpus.
corpus = list(map(clean_text, all_headlines))

In [11]:
# Show the first five cleaned headlines.
print(corpus[:5])

['finding an expansive view  of a forgotten people in niger', 'and now  the dreaded trump curse', 'venezuelas descent into dictatorship', 'stain permeates basketball blue blood', 'taking things for granted']


In [12]:
from keras.preprocessing.text import Tokenizer
from keras.models import Sequential
from keras.layers import Embedding, LSTM, Dense, Dropout
from keras.preprocessing.sequence import pad_sequences
import keras.utils as ku
# NOTE(review): EarlyStopping is imported but not used in the cells shown here.
from keras.callbacks import EarlyStopping
import tensorflow as tf
# Seed TensorFlow's global random generator.
tf.random.set_seed(2)
from numpy.random import seed
# Seed NumPy's global random generator.
seed(1)




In [13]:
# Module-level tokenizer: fitted inside get_sequence_of_tokens and read again by generate_text.
tokenizer = Tokenizer()

In [14]:
def get_sequence_of_tokens(corpus):
  """Fit the shared tokenizer and expand each line into n-gram prefixes.

  The module-level ``tokenizer`` is fitted on ``corpus``. Each line is then
  converted to token ids, and every leading slice of length 2 up to the full
  line length is emitted as its own sequence.

  Args:
    corpus: Iterable of text lines.

  Returns:
    A tuple ``(input_sequences, total_words)`` where ``input_sequences`` is
    the list of prefix sequences and ``total_words`` is the size of the
    tokenizer's ``word_index`` plus one.
  """
  tokenizer.fit_on_texts(corpus)
  vocab_size = len(tokenizer.word_index) + 1

  prefixes = [
    tokens[:end]
    for tokens in tokenizer.texts_to_sequences(corpus)
    # Shortest prefix has two tokens, longest is the whole line.
    for end in range(2, len(tokens) + 1)
  ]
  return prefixes, vocab_size


In [15]:
# Fit the tokenizer on the corpus and collect the n-gram prefix sequences plus the vocabulary size.
inp_sequences,total_words = get_sequence_of_tokens(corpus)

In [16]:
# Show the first ten prefix sequences.
print(inp_sequences[:10])

[[169, 17], [169, 17, 665], [169, 17, 665, 367], [169, 17, 665, 367, 4], [169, 17, 665, 367, 4, 2], [169, 17, 665, 367, 4, 2, 666], [169, 17, 665, 367, 4, 2, 666, 170], [169, 17, 665, 367, 4, 2, 666, 170, 5], [169, 17, 665, 367, 4, 2, 666, 170, 5, 667], [6, 80]]


In [17]:
# Vocabulary size returned by get_sequence_of_tokens (word_index length + 1).
print(total_words)

2422


In [18]:
def generate_padded_sequence(input_sequences):
  """Pad the sequences to equal length and split them into inputs and labels.

  All sequences are left-padded to the length of the longest one. The final
  column becomes the label, one-hot encoded over the module-level
  ``total_words`` classes; the remaining columns are the predictors.

  Args:
    input_sequences: List of token-id sequences.

  Returns:
    A tuple ``(predictors, label, max_sequence_len)``.
  """
  max_sequence_len = max(len(seq) for seq in input_sequences)
  padded = np.array(
    pad_sequences(input_sequences,maxlen=max_sequence_len,padding='pre')
  )

  # Everything except the last column feeds the model.
  predictors = padded[:,:-1]
  # The last column is the word to predict, one-hot encoded.
  label = ku.to_categorical(padded[:,-1],num_classes=total_words)

  return predictors,label,max_sequence_len

In [19]:
# Build the padded model inputs, one-hot labels, and the padded sequence length.
predictors,label,max_sequence_len = generate_padded_sequence(inp_sequences)

In [20]:
def create_model(max_sequence_len,total_words):
  """Build and compile the next-word prediction network.

  The stack is an Embedding layer (``total_words`` entries, 10 dimensions),
  an LSTM with 100 units, Dropout at rate 0.1, and a softmax Dense layer with
  one output per vocabulary entry.

  Args:
    max_sequence_len: Padded sequence length including the label position;
      the model input length is one less than this.
    total_words: Vocabulary size, used for both the embedding input dimension
      and the output layer width.

  Returns:
    The model, compiled with the Adam optimizer and categorical
    cross-entropy loss.
  """
  model = Sequential([
    Embedding(total_words,10,input_length=max_sequence_len - 1),
    LSTM(100),
    Dropout(0.1),
    Dense(total_words,activation='softmax'),
  ])
  model.compile(loss='categorical_crossentropy',optimizer='adam')
  return model

In [21]:
# Instantiate the compiled network for the observed sequence length and vocabulary size.
model = create_model(max_sequence_len,total_words)





In [22]:
# Print each layer's output shape and parameter count.
model.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, 18, 10)            24220     
                                                                 
 lstm (LSTM)                 (None, 100)               44400     
                                                                 
 dropout (Dropout)           (None, 100)               0         
                                                                 
 dense (Dense)               (None, 2422)              244622    
                                                                 
Total params: 313242 (1.19 MB)
Trainable params: 313242 (1.19 MB)
Non-trainable params: 0 (0.00 Byte)
_________________________________________________________________


In [23]:
# Train for 10 epochs on the predictor/label arrays; the returned History object becomes the cell output.
model.fit(predictors,label,epochs=10,verbose=1)

Epoch 1/10

Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.src.callbacks.History at 0x250e933ff40>

In [24]:
def generate_text(seed_text,next_words,model,max_sequence_len):
  """Extend ``seed_text`` with greedily predicted words.

  On each step the running text is tokenized with the module-level
  ``tokenizer``, left-padded to ``max_sequence_len - 1`` ids, and passed to
  ``model.predict``. The highest-scoring index is mapped back to a word and
  appended to the text.

  Args:
    seed_text: Starting text to continue.
    next_words: Number of prediction steps to run.
    model: Object whose ``predict`` returns one row of scores per input row.
    max_sequence_len: Padded sequence length including the label position;
      inputs are padded to one less than this.

  Returns:
    The seed text followed by the generated words, passed through
    ``str.title()``.
  """
  input_len = max_sequence_len - 1
  for _ in range(next_words):
    token_list = tokenizer.texts_to_sequences([seed_text])[0]
    padded = pad_sequences([token_list],maxlen=input_len,padding='pre')
    predicted = model.predict(padded,verbose=0)
    # Exactly one row is predicted, so reduce the arg-max to a plain int
    # instead of comparing ints against a NumPy array.
    predicted_index = int(np.argmax(predicted,axis=1)[0])

    # Direct reverse lookup. An index with no vocabulary entry falls back to
    # "" rather than raising NameError or silently repeating the previous
    # word, which is what the misspelled default in the old loop caused.
    output_word = tokenizer.index_word.get(predicted_index,"")
    seed_text = seed_text + " " + output_word
  return seed_text.title()


In [25]:
# Ask the trained model for 3 more words after this seed.
generate_text("donald trump",3,model,max_sequence_len)

'Donald Trump The Americans Of'

In [26]:
# Ask the trained model for 3 more words after this seed.
generate_text("india and pakistan",3,model,max_sequence_len)

'India And Pakistan The Americans Of'

In [27]:
# Ask the trained model for 5 more words after this seed.
generate_text("science and technology",5,model,max_sequence_len)

'Science And Technology The Americans Of The Americans'