# Text Sequence Prediction

**Steps involved:**

1. Import necessary libraries

2. Load the dataset
3. Pre-processing: Cleaning
4. Tokenizing: N-grams
5. Padding: Make all sequences the same length
6. Model building
    *   Embedding layer
    *   LSTM layer
    *   Dropout layer
    *   Dense layer
7. Model training
8. Prediction of text



In [None]:
# keras module for building LSTM 
from keras.preprocessing.sequence import pad_sequences
from keras.layers import Embedding, LSTM, Dense, Dropout
from keras.preprocessing.text import Tokenizer
from keras.callbacks import EarlyStopping
from keras.models import Sequential
import keras.utils as ku 
from keras.utils.np_utils import to_categorical

# set seeds for reproducibility
from numpy.random import seed
seed(1)

import pandas as pd
import numpy as np
import string, os 

import warnings
warnings.filterwarnings("ignore")
warnings.simplefilter(action='ignore', category=FutureWarning)

**Step 1: Load the dataset**

**DATASET: NEWS HEADLINES**


In [None]:
# Mount Google Drive into the Colab runtime at /content/drive so the dataset
# files stored on Drive can be read like local files.
# NOTE: `google.colab` is only importable inside a Google Colab session.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# Folder on the mounted Drive that holds the headline CSV files.
curr_dir = '/content/drive/MyDrive/data/predict_text/'
all_headlines = []
for filename in os.listdir(curr_dir):
    # Only files whose name contains 'Articles' are considered.
    if 'Articles' in filename:
        # curr_dir ends with '/', so plain concatenation yields a valid path.
        article_df = pd.read_csv(curr_dir + filename)
        all_headlines.extend(list(article_df.headline.values))
        # Stop after the first matching file, so at most one CSV is loaded.
        # NOTE(review): os.listdir returns entries in arbitrary order, so if
        # several 'Articles' files exist the one picked may differ between
        # runs -- confirm that loading a single file is intended.
        break

# Drop entries whose headline is the literal placeholder "Unknown".
all_headlines = [h for h in all_headlines if h != "Unknown"]
print("Total no. of headlines:",len(all_headlines))
print()
print("Sample texts:")
# Notebook display of the first ten raw headlines.
all_headlines[:10]

Total no. of headlines: 831

Sample texts:


['Finding an Expansive View  of a Forgotten People in Niger',
 'And Now,  the Dreaded Trump Curse',
 'Venezuela’s Descent Into Dictatorship',
 'Stain Permeates Basketball Blue Blood',
 'Taking Things for Granted',
 'The Caged Beast Awakens',
 'An Ever-Unfolding Story',
 'O’Reilly Thrives as Settlements Add Up',
 'Mouse Infestation',
 'Divide in G.O.P. Now Threatens Trump Tax Plan']

**Step 2: Pre-processing**

In [None]:
def clean_text(txt):
    """Normalise a headline before tokenization.

    Every character listed in ``string.punctuation`` is removed, the rest is
    lower-cased, and finally any non-ASCII character is discarded.
    Whitespace is left untouched, so runs of spaces survive.

    Args:
        txt: the raw headline string.

    Returns:
        The cleaned, lower-case, ASCII-only string.
    """
    strip_punctuation = str.maketrans("", "", string.punctuation)
    lowered = txt.translate(strip_punctuation).lower()
    # Round-trip through bytes: non-ASCII characters become multi-byte UTF-8
    # sequences, which the ASCII decoder skips because of 'ignore'.
    ascii_only = lowered.encode("utf8").decode("ascii", "ignore")
    return ascii_only

# Clean every headline; corpus[i] is the cleaned form of all_headlines[i].
corpus = [clean_text(x) for x in all_headlines]
# Print one headline before and after cleaning as a quick sanity check.
print("Before cleaning:",all_headlines[1],"\nAfter cleaning:",corpus[1],"\n")
print()
# Notebook display of the first ten cleaned headlines.
corpus[:10]


Before cleaning: And Now,  the Dreaded Trump Curse 
After cleaning: and now  the dreaded trump curse 




['finding an expansive view  of a forgotten people in niger',
 'and now  the dreaded trump curse',
 'venezuelas descent into dictatorship',
 'stain permeates basketball blue blood',
 'taking things for granted',
 'the caged beast awakens',
 'an everunfolding story',
 'oreilly thrives as settlements add up',
 'mouse infestation',
 'divide in gop now threatens trump tax plan']

**Step 3 : Tokenizing**

In [None]:
# Shared tokenizer: fitted here and reused later when generating text.
tokenizer = Tokenizer()


def get_sequence_of_tokens(corpus):
    """Fit the shared tokenizer and expand each line into n-gram prefixes.

    The module-level ``tokenizer`` is fitted on ``corpus`` (so calling this
    function updates that shared object). Each line is then converted to its
    list of token ids, and every prefix of that list holding at least two
    tokens is emitted as one training sequence.

    Args:
        corpus: iterable of cleaned text lines.

    Returns:
        A tuple ``(input_sequences, total_words)`` where ``input_sequences``
        is the list of token-id prefixes and ``total_words`` is the size of
        the tokenizer's word index plus one.
    """
    # Build the vocabulary.
    tokenizer.fit_on_texts(corpus)
    vocab_size = len(tokenizer.word_index) + 1
    print("Total number of words:", vocab_size)

    # Turn each line into token ids, then collect all prefixes of length >= 2.
    prefixes = []
    for token_ids in tokenizer.texts_to_sequences(corpus):
        prefixes.extend(
            token_ids[:end] for end in range(2, len(token_ids) + 1)
        )
    return prefixes, vocab_size

# Build the n-gram training sequences and record the vocabulary size.
inp_sequences, total_words = get_sequence_of_tokens(corpus)
# Notebook display of the first nine sequences: the prefixes of the first
# headline, "finding an expansive view  of a forgotten people in niger".
inp_sequences[:9]

Total number of words: 2422


[[169, 17],
 [169, 17, 665],
 [169, 17, 665, 367],
 [169, 17, 665, 367, 4],
 [169, 17, 665, 367, 4, 2],
 [169, 17, 665, 367, 4, 2, 666],
 [169, 17, 665, 367, 4, 2, 666, 170],
 [169, 17, 665, 367, 4, 2, 666, 170, 5],
 [169, 17, 665, 367, 4, 2, 666, 170, 5, 667]]


Headline: finding an expansive view  of a forgotten people in niger

```
Ngram	                                    Sequence of Tokens
finding an	                               [169, 17]
finding an expansive	                     [169, 17, 665]
finding an expansive view                    [169, 17, 665, 367]
finding an expansive view  of	            [169, 17, 665, 367, 4]
finding an expansive view  of a              [169, 17, 665, 367, 4, 2]
finding an expansive view  of a forgotten    [169, 17, 665, 367, 4, 2, 666]
```

```
a      --> 2
.
.
an     --> 17
finding --> 169
.
.
.
....... --> 2421   (highest word id; total_words = 2421 + 1 = 2422)
```




**Step 4: Padding**

In [None]:
import sys
# Raise NumPy's summarisation threshold to the maximum so printed arrays are
# shown in full instead of being abbreviated with "...".
np.set_printoptions(threshold=sys.maxsize)
def generate_padded_sequences(input_sequences, num_classes=None):
    """Pad the n-gram sequences and split them into model inputs and targets.

    Every sequence is left-padded with zeros to the length of the longest
    one. The last token of each padded row becomes the target and the tokens
    before it become the predictors.

    Args:
        input_sequences: non-empty list of token-id lists.
        num_classes: width of the one-hot encoded labels (the vocabulary
            size). When omitted, the notebook-level ``total_words`` is used,
            which keeps existing single-argument calls working.

    Returns:
        A tuple ``(predictors, label, max_sequence_len)``:
        ``predictors`` has ``max_sequence_len - 1`` columns, ``label`` is the
        one-hot encoding of the final token of each row, and
        ``max_sequence_len`` is the length of the longest input sequence.
    """
    if num_classes is None:
        # Backward-compatible fallback to the notebook global.
        num_classes = total_words

    max_sequence_len = max(len(x) for x in input_sequences)
    # 'pre' padding puts the zeros in front, so the real tokens (and the
    # target in particular) always sit at the end of each row.
    input_sequences = np.array(pad_sequences(input_sequences, maxlen=max_sequence_len, padding='pre'))

    predictors, label = input_sequences[:, :-1], input_sequences[:, -1]
    label = to_categorical(label, num_classes=num_classes)
    return predictors, label, max_sequence_len

# Build the padded model inputs, the one-hot labels and the padded length.
predictors, label, max_sequence_len = generate_padded_sequences(inp_sequences)

# Inspect the first ten padded input rows and the padded sequence length.
print(predictors[:10])
print(max_sequence_len)

[[  0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0 169]
 [  0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0 169  17]
 [  0   0   0   0   0   0   0   0   0   0   0   0   0   0   0 169  17 665]
 [  0   0   0   0   0   0   0   0   0   0   0   0   0   0 169  17 665 367]
 [  0   0   0   0   0   0   0   0   0   0   0   0   0 169  17 665 367   4]
 [  0   0   0   0   0   0   0   0   0   0   0   0 169  17 665 367   4   2]
 [  0   0   0   0   0   0   0   0   0   0   0 169  17 665 367   4   2 666]
 [  0   0   0   0   0   0   0   0   0   0 169  17 665 367   4   2 666 170]
 [  0   0   0   0   0   0   0   0   0 169  17 665 367   4   2 666 170   5]
 [  0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   6]]
19


**Step 5: Model Creation**

In [None]:
def create_model(max_sequence_len, total_words, embedding_dim=10,
                 lstm_units=100, dropout_rate=0.1):
    """Build and compile the next-word prediction network.

    The network is Embedding -> LSTM -> Dropout -> Dense(softmax) and is
    compiled with categorical cross-entropy and the Adam optimizer.

    Args:
        max_sequence_len: length of the padded n-gram sequences, including
            the final token that is used as the label.
        total_words: vocabulary size; used both as the embedding input size
            and as the number of softmax outputs.
        embedding_dim: size of each word embedding vector (default 10).
        lstm_units: number of units in the LSTM layer (default 100).
        dropout_rate: dropout fraction applied after the LSTM (default 0.1).

    Returns:
        The compiled ``Sequential`` model.
    """
    # The last token of every padded sequence is the label, so the network
    # sees one token fewer than max_sequence_len.
    input_len = max_sequence_len - 1
    model = Sequential()

    # Add Input Embedding Layer
    model.add(Embedding(total_words, embedding_dim, input_length=input_len))

    # Add Hidden Layer 1 - LSTM Layer
    model.add(LSTM(lstm_units))
    model.add(Dropout(dropout_rate))

    # Add Output Layer: one probability per vocabulary entry
    model.add(Dense(total_words, activation='softmax'))

    model.compile(loss='categorical_crossentropy', optimizer='adam')

    return model

# Instantiate the text model for the padded length and vocabulary size
# computed above, then print its layer-by-layer summary.
model = create_model(max_sequence_len, total_words)
model.summary()

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        (None, 18, 10)            24220     
_________________________________________________________________
lstm (LSTM)                  (None, 100)               44400     
_________________________________________________________________
dropout (Dropout)            (None, 100)               0         
_________________________________________________________________
dense (Dense)                (None, 2422)              244622    
Total params: 313,242
Trainable params: 313,242
Non-trainable params: 0
_________________________________________________________________


**Step 6: Model training**

In [None]:
# Train the next-word model for 10 epochs on the padded prefixes and their
# one-hot labels; verbose=1 shows per-epoch progress.
model.fit(predictors, label, epochs=10, verbose=1)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.History at 0x7f51c53422d0>

**Step 7 : Prediction**

In [None]:
def generate_text(seed_text, next_words, model, max_sequence_len):
    """Extend ``seed_text`` with greedily predicted words.

    On each step the current text is tokenized with the notebook-level
    ``tokenizer``, padded to the model's input length, and the word with the
    highest predicted probability is appended.

    Args:
        seed_text: the starting text.
        next_words: how many words to append.
        model: trained model whose ``predict`` returns one score per
            vocabulary id.
        max_sequence_len: padded sequence length used during training; the
            model input length is ``max_sequence_len - 1``.

    Returns:
        The seed text followed by the generated words, in title case.
    """
    for _ in range(next_words):
        token_list = tokenizer.texts_to_sequences([seed_text])[0]
        token_list = pad_sequences([token_list], maxlen=max_sequence_len-1, padding='pre')
        # `Sequential.predict_classes` was removed in TensorFlow 2.6; taking
        # the argmax of the predicted distribution is the equivalent.
        predicted = np.argmax(model.predict(token_list, verbose=0), axis=-1)
        # Trace of the chosen word id, kept from the original implementation.
        print(predicted)
        # `index_word` is the tokenizer's id -> word mapping, so no scan over
        # `word_index` is needed. Ids with no word (e.g. 0) yield "".
        output_word = tokenizer.index_word.get(int(predicted[0]), "")
        seed_text += " " + output_word
    return seed_text.title()

In [None]:
# Generate one and then seven follow-up words for the same seed text.
print (generate_text("united states", 1, model, max_sequence_len))
print (generate_text("united states", 7, model, max_sequence_len))

[30]
United States York
[30]
[3]
[1]
[193]
[4]
[1]
[193]
United States York To The Americans Of The Americans


# Number Sequence Prediction

In [None]:
import numpy as np


def splitSequence(seq, n_steps):
    """Cut a sequence into sliding windows and their next-value targets.

    Window ``k`` is ``seq[k:k + n_steps]`` and its target is
    ``seq[k + n_steps]``. Windows are produced for as long as a target
    element exists, so a sequence that is not longer than ``n_steps``
    yields no samples.

    Args:
        seq: indexable, sliceable sequence of values.
        n_steps: number of consecutive values in each input window.

    Returns:
        A tuple ``(X, y)`` of NumPy arrays holding the windows and the
        matching targets.
    """
    length = len(seq)
    # A window starting at k is valid while k + n_steps <= length - 1.
    n_windows = min(length, max(length - n_steps, 0))

    windows = [seq[start:start + n_steps] for start in range(n_windows)]
    targets = [seq[start + n_steps] for start in range(n_windows)]

    return np.array(windows), np.array(targets)

**Data Preparation**

In [None]:
# Toy series: the multiples of 10 from 10 to 200.
data = [10, 20, 30, 40, 50, 60, 70, 80, 90, 100, 110, 120, 130, 140, 150, 160, 170, 180, 190, 200]

In [None]:
# Window length: how many consecutive values form one input sample.
n_steps = 5
# Pass the variable, not a second literal, so the windows always match the
# n_steps used later for the test reshape and the model's input shape.
X, y = splitSequence(data, n_steps=n_steps)
# Show every input window next to the value the model should predict.
for window, target in zip(X, y):
    print(window, target)

[10 20 30 40 50] 60
[20 30 40 50 60] 70
[30 40 50 60 70] 80
[40 50 60 70 80] 90
[50 60 70 80 90] 100
[ 60  70  80  90 100] 110
[ 70  80  90 100 110] 120
[ 80  90 100 110 120] 130
[ 90 100 110 120 130] 140
[100 110 120 130 140] 150
[110 120 130 140 150] 160
[120 130 140 150 160] 170
[130 140 150 160 170] 180
[140 150 160 170 180] 190
[150 160 170 180 190] 200


In [None]:
# reshape from [samples, timesteps] into [samples, timesteps, features]
n_features = 1
X = X.reshape((X.shape[0], X.shape[1], n_features))
print(X[:2])

[[[10]
  [20]
  [30]
  [40]
  [50]]

 [[20]
  [30]
  [40]
  [50]
  [60]]]


**Model Building**

In [None]:
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers

In [None]:
# Number-sequence model: one 50-unit LSTM reading (n_steps, n_features)
# windows, followed by a Dense layer with a single output unit.
# NOTE: this rebinds `model`, replacing the text model created earlier.
model = tf.keras.Sequential()
model.add(layers.LSTM(50, activation='relu', input_shape=(n_steps, n_features)))
model.add(layers.Dense(1))

model.summary()


Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
lstm (LSTM)                  (None, 50)                10400     
_________________________________________________________________
dense (Dense)                (None, 1)                 51        
Total params: 10,451
Trainable params: 10,451
Non-trainable params: 0
_________________________________________________________________


In [None]:
# Predicting the next number is a regression task: optimise mean squared
# error and report mean absolute error. Classification accuracy is not
# meaningful for continuous targets, so it is no longer tracked.
model.compile(optimizer=tf.keras.optimizers.Adam(0.01), loss=tf.keras.losses.MeanSquaredError(), metrics=['mae'])

In [None]:
# Train the number-sequence model for 20 epochs on the windowed data.
model.fit(X, y, epochs=20)


Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


<tensorflow.python.keras.callbacks.History at 0x7fe68d0d7c10>

In [None]:
# A single input window, shaped (1, n_steps, n_features) like the training
# samples. This exact window also occurs in the training data, where its
# target is 140.
test_data = np.array([90, 100, 110, 120, 130])
test_data = test_data.reshape((1, n_steps, n_features))
# Notebook display of the reshaped window.
test_data

array([[[ 90],
        [100],
        [110],
        [120],
        [130]]])

**Output prediction**

In [None]:
# Ask the trained model for the value that follows the test window and
# print the raw prediction array.
predictNextNumber = model.predict(test_data, verbose=1)
print(predictNextNumber)

[[140.49368]]
