In [1]:
# Import necessary libraries
import pandas as pd  # Data manipulation and analysis library
import numpy as np  # Library for numerical computations
import string, os  # Libraries for string and OS related operations
import warnings  # Library to handle warning messages
warnings.filterwarnings("ignore")  # Ignore all warning messages
warnings.simplefilter(action='ignore', category=FutureWarning)  # Ignore future warnings


In [2]:
# Import necessary modules from Keras
from keras_preprocessing.sequence import pad_sequences  # Function for padding sequences
from keras.layers import Embedding, LSTM, Dense, Dropout  # Layers to be used in building our model
from keras.preprocessing.text import Tokenizer  # Class for tokenizing text
from keras.callbacks import EarlyStopping  # Callback function to stop training when a monitored quantity has stopped improving
from keras.models import Sequential  # Linear stack of layers
import keras.utils as ku  # Utilities module

In [3]:
# set seeds for reproducability
import tensorflow
from numpy.random import seed
seed(2)
x=3
tensorflow.random.set_seed(x)

In [4]:
# Set the directory containing the data
curr_dir = 'archive/'
all_headlines = []  # List to store all headlines
for filename in os.listdir(curr_dir):  # Loop over every file in the directory
    if 'Articles' in filename:  # If the filename contains 'Articles'
        article_df = pd.read_csv(curr_dir + filename)  # Read the file as a dataframe
        all_headlines.extend(list(article_df.headline.values))  # Extend the 'all_headlines' list with the headlines from the dataframe
        
all_headlines = [line for line in all_headlines if line!= "Unknown"]  # Filter out any headlines that are 'Unknown'
print(all_headlines[:10])  # Print the first 10 headlines

['Finding an Expansive View  of a Forgotten People in Niger', 'And Now,  the Dreaded Trump Curse', 'Venezuela’s Descent Into Dictatorship', 'Stain Permeates Basketball Blue Blood', 'Taking Things for Granted', 'The Caged Beast Awakens', 'An Ever-Unfolding Story', 'O’Reilly Thrives as Settlements Add Up', 'Mouse Infestation', 'Divide in G.O.P. Now Threatens Trump Tax Plan']


In [5]:
len(all_headlines)

8603

In [6]:
# Function to clean the text
def clean_text(txt):
    txt = "".join(t for t in txt if t not in string.punctuation).lower()  # Remove punctuation and convert to lower case
    txt = txt.encode("utf8").decode("ascii",'ignore')  # Encode as UTF-8, decode to ASCII (ignoring errors)
    return txt
corpus = [clean_text(x) for x in all_headlines]  # Clean all headlines and store in 'corpus'
print(corpus[:10])  # Print the first 10 cleaned headlines

['finding an expansive view  of a forgotten people in niger', 'and now  the dreaded trump curse', 'venezuelas descent into dictatorship', 'stain permeates basketball blue blood', 'taking things for granted', 'the caged beast awakens', 'an everunfolding story', 'oreilly thrives as settlements add up', 'mouse infestation', 'divide in gop now threatens trump tax plan']


In [7]:
tokenizer = Tokenizer()
# Function to get a sequence of tokens
def get_sequence_of_tokens(corpus):
    tokenizer.fit_on_texts(corpus)  # Fit the tokenizer on the corpus
    print(tokenizer)
    total_words = len(tokenizer.word_index) + 1  # Get the total number of words
    
    # Convert data to a sequence of tokens 
    input_sequences = []
    for line in corpus:
        token_list = tokenizer.texts_to_sequences([line])[0]  # Convert text to a sequence of tokens
        for i in range(1, len(token_list)):
            n_gram_sequence = token_list[:i+1]  # Get n-gram sequence
            #print("n_gram_sequence",n_gram_sequence)
            input_sequences.append(n_gram_sequence)  # Append the sequence to 'input_sequences'
            print("input_sequences",input_sequences)
    return input_sequences, total_words
inp_sequences, total_words = get_sequence_of_tokens(corpus)  # Get sequences of tokens and total words
print(inp_sequences[:10])  # Print the first 10 sequences

<keras.preprocessing.text.Tokenizer object at 0x000002510DEAD370>
input_sequences [[391, 17]]
input_sequences [[391, 17], [391, 17, 5166]]
input_sequences [[391, 17], [391, 17, 5166], [391, 17, 5166, 523]]
input_sequences [[391, 17], [391, 17, 5166], [391, 17, 5166, 523], [391, 17, 5166, 523, 4]]
input_sequences [[391, 17], [391, 17, 5166], [391, 17, 5166, 523], [391, 17, 5166, 523, 4], [391, 17, 5166, 523, 4, 2]]
input_sequences [[391, 17], [391, 17, 5166], [391, 17, 5166, 523], [391, 17, 5166, 523, 4], [391, 17, 5166, 523, 4, 2], [391, 17, 5166, 523, 4, 2, 1601]]
input_sequences [[391, 17], [391, 17, 5166], [391, 17, 5166, 523], [391, 17, 5166, 523, 4], [391, 17, 5166, 523, 4, 2], [391, 17, 5166, 523, 4, 2, 1601], [391, 17, 5166, 523, 4, 2, 1601, 134]]
input_sequences [[391, 17], [391, 17, 5166], [391, 17, 5166, 523], [391, 17, 5166, 523, 4], [391, 17, 5166, 523, 4, 2], [391, 17, 5166, 523, 4, 2, 1601], [391, 17, 5166, 523, 4, 2, 1601, 134], [391, 17, 5166, 523, 4, 2, 1601, 134, 5]]


IOPub data rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_data_rate_limit`.

Current values:
NotebookApp.iopub_data_rate_limit=1000000.0 (bytes/sec)
NotebookApp.rate_limit_window=3.0 (secs)




input_sequences [[391, 17], [391, 17, 5166], [391, 17, 5166, 523], [391, 17, 5166, 523, 4], [391, 17, 5166, 523, 4, 2], [391, 17, 5166, 523, 4, 2, 1601], [391, 17, 5166, 523, 4, 2, 1601, 134], [391, 17, 5166, 523, 4, 2, 1601, 134, 5], [391, 17, 5166, 523, 4, 2, 1601, 134, 5, 1951], [7, 57], [7, 57, 1], [7, 57, 1, 5167], [7, 57, 1, 5167, 10], [7, 57, 1, 5167, 10, 5168], [3366, 5169], [3366, 5169, 67], [3366, 5169, 67, 3367], [2484, 5170], [2484, 5170, 1952], [2484, 5170, 1952, 664], [2484, 5170, 1952, 664, 436], [259, 371], [259, 371, 6], [259, 371, 6, 2485], [1, 5171], [1, 5171, 2486], [1, 5171, 2486, 2487], [17, 5172], [17, 5172, 202], [1360, 3368], [1360, 3368, 19], [1360, 3368, 19, 3369], [1360, 3368, 19, 3369, 736], [1360, 3368, 19, 3369, 736, 39], [5173, 5174], [812, 5], [812, 5, 77], [812, 5, 77, 57], [812, 5, 77, 57, 1953], [812, 5, 77, 57, 1953, 10], [812, 5, 77, 57, 1953, 10, 135], [812, 5, 77, 57, 1953, 10, 135, 73], [125, 349], [125, 349, 331], [94, 33], [94, 33, 665], [94,

IOPub data rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_data_rate_limit`.

Current values:
NotebookApp.iopub_data_rate_limit=1000000.0 (bytes/sec)
NotebookApp.rate_limit_window=3.0 (secs)



input_sequences [[391, 17], [391, 17, 5166], [391, 17, 5166, 523], [391, 17, 5166, 523, 4], [391, 17, 5166, 523, 4, 2], [391, 17, 5166, 523, 4, 2, 1601], [391, 17, 5166, 523, 4, 2, 1601, 134], [391, 17, 5166, 523, 4, 2, 1601, 134, 5], [391, 17, 5166, 523, 4, 2, 1601, 134, 5, 1951], [7, 57], [7, 57, 1], [7, 57, 1, 5167], [7, 57, 1, 5167, 10], [7, 57, 1, 5167, 10, 5168], [3366, 5169], [3366, 5169, 67], [3366, 5169, 67, 3367], [2484, 5170], [2484, 5170, 1952], [2484, 5170, 1952, 664], [2484, 5170, 1952, 664, 436], [259, 371], [259, 371, 6], [259, 371, 6, 2485], [1, 5171], [1, 5171, 2486], [1, 5171, 2486, 2487], [17, 5172], [17, 5172, 202], [1360, 3368], [1360, 3368, 19], [1360, 3368, 19, 3369], [1360, 3368, 19, 3369, 736], [1360, 3368, 19, 3369, 736, 39], [5173, 5174], [812, 5], [812, 5, 77], [812, 5, 77, 57], [812, 5, 77, 57, 1953], [812, 5, 77, 57, 1953, 10], [812, 5, 77, 57, 1953, 10, 135], [812, 5, 77, 57, 1953, 10, 135, 73], [125, 349], [125, 349, 331], [94, 33], [94, 33, 665], [94, 

IOPub data rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_data_rate_limit`.

Current values:
NotebookApp.iopub_data_rate_limit=1000000.0 (bytes/sec)
NotebookApp.rate_limit_window=3.0 (secs)



[[391, 17], [391, 17, 5166], [391, 17, 5166, 523], [391, 17, 5166, 523, 4], [391, 17, 5166, 523, 4, 2], [391, 17, 5166, 523, 4, 2, 1601], [391, 17, 5166, 523, 4, 2, 1601, 134], [391, 17, 5166, 523, 4, 2, 1601, 134, 5], [391, 17, 5166, 523, 4, 2, 1601, 134, 5, 1951], [7, 57], [7, 57, 1], [7, 57, 1, 5167], [7, 57, 1, 5167, 10], [7, 57, 1, 5167, 10, 5168], [3366, 5169], [3366, 5169, 67], [3366, 5169, 67, 3367], [2484, 5170], [2484, 5170, 1952], [2484, 5170, 1952, 664], [2484, 5170, 1952, 664, 436], [259, 371], [259, 371, 6], [259, 371, 6, 2485], [1, 5171], [1, 5171, 2486], [1, 5171, 2486, 2487], [17, 5172], [17, 5172, 202], [1360, 3368], [1360, 3368, 19], [1360, 3368, 19, 3369], [1360, 3368, 19, 3369, 736], [1360, 3368, 19, 3369, 736, 39], [5173, 5174], [812, 5], [812, 5, 77], [812, 5, 77, 57], [812, 5, 77, 57, 1953], [812, 5, 77, 57, 1953, 10], [812, 5, 77, 57, 1953, 10, 135], [812, 5, 77, 57, 1953, 10, 135, 73], [125, 349], [125, 349, 331], [94, 33], [94, 33, 665], [94, 33, 665, 2], [94

IOPub data rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_data_rate_limit`.

Current values:
NotebookApp.iopub_data_rate_limit=1000000.0 (bytes/sec)
NotebookApp.rate_limit_window=3.0 (secs)



In [8]:
# X                      Y
# ------                 -----                     
# Ali Ata                Bak
# Ali Ata Bak            Tamam
# Ali Ata Bak Tamam      mi
# Ali Ata Bak Tamam mi

#Flex

In [9]:
# Function to generate padded sequences
def generate_padded_sequences(input_sequences):
    max_sequence_len = max([len(x) for x in input_sequences])  # Get the length of the longest sequence
    
    input_sequences = np.array(pad_sequences(input_sequences, maxlen=max_sequence_len, padding='pre'))  # Pad all sequences
    X, label = input_sequences[:,:-1],input_sequences[:,-1]  # Get predictors and label
    label = ku.to_categorical(label, num_classes=total_words)  # Convert labels to categorical
    return X, label, max_sequence_len
X, label, max_sequence_len = generate_padded_sequences(inp_sequences)  # Get predictors, label, and max sequence length


In [10]:
# Function to create the model
def create_model(max_sequence_len, total_words):
    input_len = max_sequence_len - 1  # Length of input to the model
    model = Sequential()  # Create a Sequential model
    model.add(Embedding(total_words, 10, input_length=input_len))  # Add an Embedding layer
    model.add(LSTM(100))  # Add an LSTM layer
    model.add(Dropout(0.1))  # Add a Dropout layer
    model.add(Dense(total_words, activation='softmax'))  # Add a Dense layer
    model.compile(loss='categorical_crossentropy', optimizer='adam')  # Compile the model
    return model
model = create_model(max_sequence_len, total_words)  # Create the model
model.summary()  # Print a summary of the model

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, 23, 10)            112650    
                                                                 
 lstm (LSTM)                 (None, 100)               44400     
                                                                 
 dropout (Dropout)           (None, 100)               0         
                                                                 
 dense (Dense)               (None, 11265)             1137765   
                                                                 
Total params: 1,294,815
Trainable params: 1,294,815
Non-trainable params: 0
_________________________________________________________________


In [55]:
model.fit(X, label, epochs=400)

Epoch 1/400
Epoch 2/400
Epoch 3/400
Epoch 4/400
Epoch 5/400
Epoch 6/400
Epoch 7/400
Epoch 8/400
Epoch 9/400
Epoch 10/400
Epoch 11/400
Epoch 12/400
Epoch 13/400
Epoch 14/400
Epoch 15/400
Epoch 16/400
Epoch 17/400
Epoch 18/400
Epoch 19/400
Epoch 20/400
Epoch 21/400
Epoch 22/400
Epoch 23/400
Epoch 24/400
Epoch 25/400
Epoch 26/400
Epoch 27/400
Epoch 28/400
Epoch 29/400
Epoch 30/400
Epoch 31/400
Epoch 32/400
Epoch 33/400
Epoch 34/400
Epoch 35/400
Epoch 36/400
Epoch 37/400
Epoch 38/400
Epoch 39/400
Epoch 40/400
Epoch 41/400
Epoch 42/400
Epoch 43/400
Epoch 44/400
Epoch 45/400
Epoch 46/400
Epoch 47/400
Epoch 48/400
Epoch 49/400
Epoch 50/400
Epoch 51/400
Epoch 52/400
Epoch 53/400
Epoch 54/400
Epoch 55/400
Epoch 56/400
Epoch 57/400
Epoch 58/400
Epoch 59/400
Epoch 60/400
Epoch 61/400
Epoch 62/400
Epoch 63/400
Epoch 64/400
Epoch 65/400
Epoch 66/400
Epoch 67/400
Epoch 68/400
Epoch 69/400
Epoch 70/400
Epoch 71/400
Epoch 72/400
Epoch 73/400
Epoch 74/400
Epoch 75/400
Epoch 76/400
Epoch 77/400
Epoch 78

Epoch 189/400
Epoch 190/400
Epoch 191/400
Epoch 192/400
Epoch 193/400
Epoch 194/400
Epoch 195/400
Epoch 196/400
Epoch 197/400
Epoch 198/400
Epoch 199/400
Epoch 200/400
Epoch 201/400
Epoch 202/400
Epoch 203/400
Epoch 204/400
Epoch 205/400
Epoch 206/400
Epoch 207/400
Epoch 208/400
Epoch 209/400
Epoch 210/400
Epoch 211/400
Epoch 212/400
Epoch 213/400
Epoch 214/400
Epoch 215/400
Epoch 216/400
Epoch 217/400
Epoch 218/400
Epoch 219/400
Epoch 220/400
Epoch 221/400
Epoch 222/400
Epoch 223/400
Epoch 224/400
Epoch 225/400
Epoch 226/400
Epoch 227/400
Epoch 228/400
Epoch 229/400
Epoch 230/400
Epoch 231/400
Epoch 232/400
Epoch 233/400
Epoch 234/400
Epoch 235/400
Epoch 236/400
Epoch 237/400
Epoch 238/400
Epoch 239/400
Epoch 240/400
Epoch 241/400
Epoch 242/400
Epoch 243/400
Epoch 244/400
Epoch 245/400
Epoch 246/400
Epoch 247/400
Epoch 248/400
Epoch 249/400
Epoch 250/400
Epoch 251/400
Epoch 252/400
Epoch 253/400
Epoch 254/400
Epoch 255/400
Epoch 256/400
Epoch 257/400
Epoch 258/400
Epoch 259/400
Epoch 

Epoch 374/400
Epoch 375/400
Epoch 376/400
Epoch 377/400
Epoch 378/400
Epoch 379/400
Epoch 380/400
Epoch 381/400
Epoch 382/400
Epoch 383/400
Epoch 384/400
Epoch 385/400
Epoch 386/400
Epoch 387/400
Epoch 388/400
Epoch 389/400
Epoch 390/400
Epoch 391/400
Epoch 392/400
Epoch 393/400
Epoch 394/400
Epoch 395/400
Epoch 396/400
Epoch 397/400
Epoch 398/400
Epoch 399/400
Epoch 400/400


<keras.callbacks.History at 0x18857d5b2e0>

In [11]:
def generate_text(seed_text, next_words, model, max_sequence_len):
    for _ in range(next_words):
        token_list = tokenizer.texts_to_sequences([seed_text])[0]# Convert seed text to a sequence of tokens
        #print(token_list)
        token_list = pad_sequences([token_list], maxlen=max_sequence_len-1, padding='pre')# Pad the sequence
        #print(token_list)
        
        predict_x=model.predict(token_list,verbose=0) # Predict the next word
        predicted=np.argmax(predict_x,axis=1)
        #predicted = model.predict_classes(token_list, verbose=0)
        
        output_word = ""
        for word,index in tokenizer.word_index.items(): # For each item in the tokenizer's word index
            if index == predicted: # If the index is the predicted word's index
                output_word = word  # Set output_word to the current word
                break
        seed_text += " "+output_word # Add the output word to the seed text
    return seed_text.title() # Return the title-cased seed text

In [57]:
#model.save_weights('Next Word Predictor_EPOCH400_v30.h5')

In [12]:
model.load_weights('Next Word Predictor_EPOCH400_v31.h5')

In [13]:
total_words

11265

In [14]:
tokenizer.word_index.items()



In [15]:
print (generate_text("united states", 3, model, max_sequence_len))
print (generate_text("preident trump", 4, model, max_sequence_len))
print (generate_text("donald trump", 4, model, max_sequence_len))
print (generate_text("india and china", 4, model, max_sequence_len))
print (generate_text("new york", 4, model, max_sequence_len))
print (generate_text("science and technology", 5, model, max_sequence_len))

United States Ever Include In
Preident Trump Is Saving Us Minnesota
Donald Trump Vs The Food Snobs
India And China Pick A New Year
New York Today A Noreaster Nears
Science And Technology And Love Season For A


In [16]:
tokenizer.texts_to_sequences(["states"])

[[245]]

In [17]:
find=1157
for index,x in enumerate(X):
    if find in x:
        #print(x)
        text=""
        for kelime in x:
            output_word=""
            for word,index in tokenizer.word_index.items(): # For each item in the tokenizer's word index
                if index == kelime: # If the index is the predicted word's index
                    output_word = word  # Set output_word to the current word
                    break
            text += " "+output_word # Add the output word to the seed text
        print(text)
            

            presidents supporters fear he will go too soft at state of union
                   the state of the union
                  the state of the union address
                 the state of the union address a
                the state of the union address a place
               the state of the union address a place for
              the state of the union address a place for student
             the state of the union address a place for student predictions
            the state of the union address a place for student predictions and
                    at state of union
                   at state of union mother
                  at state of union mother of
                 at state of union mother of ms13
                at state of union mother of ms13 victim
               at state of union mother of ms13 victim says
              at state of union mother of ms13 victim says shell
             at state of union mother of ms13 victim says shell seek
            at state 

In [63]:
model.save_weights('Next Word Predictor_EPOCH400_v31.h5')