In [65]:
import pandas as pd
from tensorflow.keras.preprocessing.sequence import pad_sequences
import numpy
import tensorflow as tf 

import seaborn as sns
import matplotlib.pyplot as plt
import warnings
import os
import keras
import numpy as np
from keras.models import Sequential
from keras.layers import Dense, LSTM, Embedding, Dropout, Bidirectional, GRU, Attention, LayerNormalization

from keras.preprocessing.text import Tokenizer
from keras.utils import to_categorical
import math

warnings.filterwarnings('ignore')

os.environ['PYTHONHASHSEED'] = '1'
keras.utils.set_random_seed(42)
tf.random.set_seed(42)
#generate random weights for reproducibility
np.random.seed(42)
import random
random.seed(42)

# Data Analysis

In [66]:
with open ('train.csv', 'r') as f:
    data = f.read()
    data = data.split('\n')

In [67]:
lines = data
lines = [line for line in lines if line]

# Find the longest sentence based on the number of words
word_counts = [len(line.split()) for line in lines]
longest_sentence_index = word_counts.index(max(word_counts))
longest_sentence = lines[longest_sentence_index]
print(f"Longest Sentence: {longest_sentence}")
print(f"Number of Words in Longest Sentence: {max(word_counts)}\n\n")
most = max(word_counts)

# Find the most robust sentence based on word count
sentences_word_count = [len(sentence.split()) for sentence in lines]
most_robust_sentence_index = sentences_word_count.index(max(sentences_word_count))
most_robust_sentence = lines[most_robust_sentence_index]
print(f"Most Robust Sentence: {most_robust_sentence}")
print(f"Word Count of Most Robust Sentence: {max(sentences_word_count)}\n\n")

# Find the most robust sentence based on character count
sentences_character_count = [len(sentence) for sentence in lines]
most_robust_sentence_index = sentences_character_count.index(max(sentences_character_count))
most_robust_sentence = lines[most_robust_sentence_index]
print(f"Most Robust Sentence: {most_robust_sentence}")
print(f"Character Count of Most Robust Sentence: {max(sentences_character_count)}")


Longest Sentence: "Every act of love is a tribute, a tribute to the boundless power, the boundless power of the heart, to heal and transform, to connect and uplift, to create a legacy of warmth and affection."
Number of Words in Longest Sentence: 35


Most Robust Sentence: "Every act of love is a tribute, a tribute to the boundless power, the boundless power of the heart, to heal and transform, to connect and uplift, to create a legacy of warmth and affection."
Word Count of Most Robust Sentence: 35


Most Robust Sentence: "Every challenge overcome is a testament, a testament to your indomitable spirit, a living proof of your strength and resilience, a reminder that you are capable of conquering any obstacle that comes your way."
Character Count of Most Robust Sentence: 210


# Data Preprocessing

In [68]:
data.remove(data[0])

#remove the " as well as the commas and fullstops

for i in range(len(data)):
    data[i] = data[i].replace('"', '')
    data[i] = data[i].replace(',', '')
    data[i] = data[i].replace('.', '')
    data[i] = data[i].replace(';', '')


In [69]:
data

["Embrace the beauty of every sunrise it's a fresh chance to paint your world with joy",
 'Embrace challenges they are the stepping stones to your greatest victories',
 'Embrace the rhythm of life and let it dance through your soul',
 'Embrace kindness for it has the power to change the world one heart at a time',
 'Embrace the journey for it leads to the destination of your dreams',
 'Embrace your uniqueness for it is the fingerprint of your soul on the universe',
 'Embrace the present moment for it is the only one that truly exists',
 'Embrace your fears for they hold the key to unlocking your true potential',
 'Embrace gratitude and watch how it multiplies the blessings in your life',
 'Embrace the rain for it nourishes the seeds of your future success',
 'Embrace the whispers of your heart they carry the wisdom of the universe',
 'Embrace laughter for it is the music of a joyful heart',
 'Embrace the power of forgiveness for it sets you free from the chains of the past',
 'Embrace 

In [70]:
#split the quotes into a list of words

for i in range(len(data)):
    data[i] = data[i].split(' ')


## Generating more input output pairs

![rolling window](image.png)

We can use the following algorithm to split the data into input and output pairs in the format seen in the image.

In [71]:
#convert quotes into more data friendly format using rolling window

def convert_quotes(data):
    quotes = []
    for quote in data:
        for window_size in range(2, len(quote)):
            for i in range(len(quote)):
                quotes.append(quote[i:i+window_size])
    return quotes


quotes = convert_quotes(data)

In [72]:
#convert quotes into format suitable for tokenization

quotestoken = [' '.join(quote) for quote in quotes]

#tokenize quotes

tokenizer = Tokenizer()
tokenizer.fit_on_texts(quotestoken)
sequences = tokenizer.texts_to_sequences(quotestoken)
total_words = len(tokenizer.word_index) + 1

In [73]:
#now we pad the sequences so that they are all the same length
max_sequence_len = max([len(x) for x in sequences])

padded_sequences = pad_sequences(sequences, maxlen = max_sequence_len, padding = 'pre')

In [163]:
y.shape

(57804, 1085)

In [74]:
#split the data into X and ywe cacn take the last word as the y and the rest as the X

X = padded_sequences[:, :-1]
y = padded_sequences[:, -1]

#now we need to one hot encode the y values

y = to_categorical(y, num_classes=total_words)

## Modeling

As we are done with the data preprocessing, we can now move on to the modeling part. We will be using the following models:

1. **LSTM** (Long Short Term Memory)
2. **GRU** (Gated Recurrent Unit)
3. **BiLSTM** (Bidirectional LSTM)
4. **BiGRU** (Bidirectional GRU)
5. **Stacked RNN** (Ensemble of different RNN models)



In [75]:
print(most)

35


In [76]:
input_len = max_sequence_len - 1

model = Sequential()
model.add(Embedding(total_words, 10, input_length=input_len))

# Add LSTM layer with 256 units.
model.add(LSTM(256))

# Control overfitting.
model.add(Dropout(0.3))

# Add output layer
model.add(Dense(total_words, activation='softmax'))

In [77]:
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
history = model.fit(X, y, epochs=10, verbose=1)

model.summary()

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Model: "sequential_13"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_13 (Embedding)    (None, 33, 10)            10850     
                                                                 
 lstm_4 (LSTM)               (None, 256)               273408    
                                                                 
 dropout_11 (Dropout)        (None, 256)               0         
                                                                 
 dense_11 (Dense)            (None, 1085)              278845    
                                                                 
Total params: 563,103
Trainable params: 563,103
Non-trainable params: 0
_________________________________________________________________


In [78]:
def generate_text(model, tokenizer, max_sequence_len, seed_text, next_words=1): #def next_words for later use
    for _ in range(next_words):
        token_list = tokenizer.texts_to_sequences([seed_text])[0]
        token_list = pad_sequences([token_list], maxlen=max_sequence_len-1, padding='pre')
        
        predicted_probs = model.predict(token_list)[0]
        predicted_id = np.argmax(predicted_probs)
        
        predicted_word = tokenizer.index_word.get(predicted_id, "")
        seed_text += " " + predicted_word

    return seed_text



In [79]:
#use random seed text from the data

random.seed(42)
random_index = random.randint(0, len(quotestoken))
seed_text = quotestoken[random_index]


#generate text

generated_text = generate_text(model, tokenizer, max_sequence_len, seed_text)
print('Predicted Sentence/Quote:', generated_text)

Predicted Sentence/Quote: Jurong Lake Gardens' serenity Singapore's nature is


## Evaluation

Although we can use the model to predict words, we have no way of knowing how good the model is as there are no fixed right or wrong sentences in the English language. So, we will be using the following metrics to evaluate the model instead of basing our evaluation on the accuracy of the model. First thing we can consider is the **perplexity** of the model. Perplexity is a metric that is used to evaluate how well a probability model predicts a sample. 


Perplexity:
It is given by the following formula: 


![perplexity formula](image.png)

Where:
- \(N\) is the total number of words in the dataset.
- \(P(w_i)\) is the probability assigned by the language model to the \(i\)-th word in the dataset.

This is essentially just the exponential of the crossentropy (the loss) of the model. To make the process more efficient, i will apply it only at the end.

In [80]:
#Lets save the first model for comparison with other models later on

RNNdf = pd.DataFrame(columns=['Model Name', 'Generated Text', '"Loss"', '"Accuracy"', 'History'])

# Save the LSTM model information
model_name = 'LSTM'
loss_value = history.history['loss'][-1]
accuracy_value = history.history['accuracy'][-1]

# Create a dictionary with the new values
new_row = {
    'Model Name': model_name,
    'Generated Text': generated_text,
    '"Loss"': loss_value,
    '"Accuracy"': accuracy_value,
    'History': [history]
}

# Concatenate the DataFrames
RNNdf = pd.concat([RNNdf, pd.DataFrame([new_row])], ignore_index=True)

In [81]:
model = Sequential()
model.add(Embedding(total_words, 10, input_length=input_len))
model.add(GRU(256))
model.add(Dropout(0.3))
model.add(Dense(total_words, activation='softmax'))

model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
history_gru = model.fit(X, y, epochs=10, verbose=1)

# Generate text
generated_text_gru = generate_text(model, tokenizer, max_sequence_len, seed_text)

# Save GRU model information
model_name_gru = 'GRU'
loss_value_gru = np.min(history_gru.history['loss'])  # Use min loss as an example
accuracy_value_gru = np.max(history_gru.history['accuracy'])  # Use max accuracy as an example
# Append GRU information to the DataFrame
RNNdf = pd.concat([RNNdf, pd.DataFrame([{
    'Model Name': model_name_gru,
    'Generated Text': generated_text_gru,
    '"Loss"': loss_value_gru,
    '"Accuracy"': accuracy_value_gru,
    'History': [history_gru]
}])], ignore_index=True)


Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In [82]:
#Try bidirectional LSTM

model = Sequential()
model.add(Embedding(total_words, 10, input_length=input_len))
model.add(Bidirectional(LSTM(256)))
model.add(Dropout(0.3))
model.add(Dense(total_words, activation='softmax'))

model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
history_bidirectional = model.fit(X, y, epochs=10, verbose=1)

# Generate text
generated_text_bidirectional = generate_text(model, tokenizer, max_sequence_len, seed_text)

# Save bidirectional LSTM model information
model_name_bidirectional = 'Bidirectional LSTM'
loss_value_bidirectional = np.min(history_bidirectional.history['loss'])  # Use min loss as an example
accuracy_value_bidirectional = np.max(history_bidirectional.history['accuracy'])  # Use max accuracy as an example

# concat to df
RNNdf = pd.concat([RNNdf, pd.DataFrame([{
    'Model Name': model_name_bidirectional,
    'Generated Text': generated_text_bidirectional,
    '"Loss"': loss_value_bidirectional,
    '"Accuracy"': accuracy_value_bidirectional,
    'History': [history_bidirectional]
}])], ignore_index=True)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In [83]:
#Now we can try a bidirectional GRU

model = Sequential()
model.add(Embedding(total_words, 10, input_length=input_len))
model.add(Bidirectional(GRU(256)))
model.add(Dropout(0.3))
model.add(Dense(total_words, activation='softmax'))

model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
history_bidirectional_gru = model.fit(X, y, epochs=10, verbose=1)

# Generate text
generated_text_bidirectional_gru = generate_text(model, tokenizer, max_sequence_len, seed_text)


# Save bidirectional GRU model information
model_name_bidirectional_gru = 'Bidirectional GRU'
loss_value_bidirectional_gru = np.min(history_bidirectional_gru.history['loss'])  # Use min loss as an example
accuracy_value_bidirectional_gru = np.max(history_bidirectional_gru.history['accuracy'])  # Use max accuracy as an example

# concat to df
RNNdf = pd.concat([RNNdf, pd.DataFrame([{
    'Model Name': model_name_bidirectional_gru,
    'Generated Text': generated_text_bidirectional_gru,
    '"Loss"': loss_value_bidirectional_gru,
    '"Accuracy"': accuracy_value_bidirectional_gru,
    'History': [history_bidirectional_gru]
}])], ignore_index=True)


Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In [84]:
import numpy as np


RNNdf['perplexity'] = RNNdf.apply(lambda row: [np.e**x for x in row['History'][0].history['loss']], axis=1)


In [85]:
RNNdf

Unnamed: 0,Model Name,Generated Text,"""Loss""","""Accuracy""",History,perplexity
0,LSTM,Jurong Lake Gardens' serenity Singapore's natu...,1.253504,0.684659,[<keras.callbacks.History object at 0x0000028D...,"[85.08637652848648, 25.579103563666436, 12.565..."
1,GRU,Jurong Lake Gardens' serenity Singapore's natu...,1.026306,0.737527,[<keras.callbacks.History object at 0x0000028D...,"[58.39597194494367, 11.665808092473222, 6.3488..."
2,Bidirectional LSTM,Jurong Lake Gardens' serenity Singapore's natu...,1.117934,0.716507,[<keras.callbacks.History object at 0x0000028F...,"[64.24561855023991, 15.48279495402413, 8.36094..."
3,Bidirectional GRU,Jurong Lake Gardens' serenity Singapore's natu...,0.982531,0.746488,[<keras.callbacks.History object at 0x0000028D...,"[44.89714549416552, 9.00277740825301, 5.260258..."


## Base Models/Starter Architechture 

From these "Base Models"/Simpler models, we can rougly see that the GRU models perform better compared to the LSTM models by comparing the Perplexity score (lower the better). To further improve the performance of the models, we can consider more complex model, using a different type of input data (with no duplicates) as well as trying adjusting the attention of the model. We can also consider tuning some of the parameters of the models as we have been using basic parameters for the models.

In [86]:
#remove any duplicates

undupequotes = list(set(tuple(quote) for quote in quotes))

In [87]:
#convert quotes into format suitable for tokenization

quotestoken2 = [' '.join(quote) for quote in undupequotes]

#tokenize quotes

tokenizer2 = Tokenizer()
tokenizer2.fit_on_texts(quotestoken2)
udsequences = tokenizer.texts_to_sequences(quotestoken2)
total_wordsud = len(tokenizer2.word_index) + 1

In [88]:
#now we pad the sequences so that they are all the same length
max_sequence_lenud = max([len(x) for x in udsequences])

padded_sequencesud = pad_sequences(udsequences, maxlen = max_sequence_lenud, padding = 'pre')

input_lenud = max_sequence_lenud - 1

In [89]:
#split the data into X and ywe cacn take the last word as the y and the rest as the X

Xud = padded_sequences[:, :-1]
yud = padded_sequences[:, -1]

#now we need to one hot encode the y values

yud = to_categorical(yud, num_classes=total_wordsud)

In [100]:
# We now have 2 datasets, one with duplicates and one without

# We can now write a function that receives a model, compiles it and fits it onto the two datasets
def fit_model(model, X, y, model_name, RNNdf, tokenizer, max_sequence_len, epochs=10, seed_text=''):
    model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

    # Fit the model with duplicates
    history = model.fit(X, y, epochs= epochs, verbose=0)
    generated_text = generate_text(model, tokenizer, max_sequence_len, seed_text)
    generated_text = seed_text + ' ' + generated_text
    RNNdf = pd.concat([RNNdf, pd.DataFrame([{
        'Model Name': model_name + '',
        'Generated Text': generated_text,
        '"Loss"': np.min(history.history['loss']),
        '"Accuracy"': np.max(history.history['accuracy']),
        'History': [history]
    }])], ignore_index=True)

    return RNNdf


In [91]:
# construct a stacked GRU model
model = Sequential()
model.add(Embedding(total_words, 10, input_length=input_len))
model.add(Bidirectional(GRU(256, return_sequences=True)))
model.add(Bidirectional(GRU(256)))
model.add(Dropout(0.3))
model.add(Dense(total_words, activation='softmax'))

RNNdf = fit_model(model, X, y, 'Stacked BidiGRU (with duplicates)', RNNdf, tokenizer, max_sequence_len, epochs=10, seed_text=seed_text)

model.summary()

Model: "sequential_17"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_17 (Embedding)    (None, 33, 10)            10850     
                                                                 
 bidirectional_16 (Bidirecti  (None, 33, 512)          411648    
 onal)                                                           
                                                                 
 bidirectional_17 (Bidirecti  (None, 512)              1182720   
 onal)                                                           
                                                                 
 dropout_15 (Dropout)        (None, 512)               0         
                                                                 
 dense_15 (Dense)            (None, 1085)              556605    
                                                                 
Total params: 2,161,823
Trainable params: 2,161,823
N

In [92]:
model = Sequential()
model.add(Embedding(total_wordsud, 10, input_length=input_lenud))
model.add(Bidirectional(GRU(256, return_sequences=True)))
model.add(Bidirectional(GRU(256)))
model.add(Dropout(0.3))
model.add(Dense(total_words, activation='softmax'))

RNNdf = fit_model(model, X, y, Xud, yud, 'Stacked BidiGRU (without duplicates)', RNNdf, tokenizer2, max_sequence_lenud, epochs=10, seed_text=seed_text)

model.summary()

Model: "sequential_18"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_18 (Embedding)    (None, 33, 10)            10850     
                                                                 
 bidirectional_18 (Bidirecti  (None, 33, 512)          411648    
 onal)                                                           
                                                                 
 bidirectional_19 (Bidirecti  (None, 512)              1182720   
 onal)                                                           
                                                                 
 dropout_16 (Dropout)        (None, 512)               0         
                                                                 
 dense_16 (Dense)            (None, 1085)              556605    
                                                                 
Total params: 2,161,823
Trainable params: 2,161,823
N

In [97]:
RNNdf

## Evaluation

We can see that the stacked GRU model performs the best out of all the models. This is because the stacked GRU model is able to learn more complex patterns in the data compared to the other models. We can try to further increase the performance of the model by adding an attention layer to the model. An attention layer is a layer that is added to the model to help the model focus on the important parts of the input data. This will help the model to learn more complex patterns in the data and thus, increase the performance of the model.

In [160]:
model.summary()

Model: "model_18"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 input_39 (InputLayer)          [(None, 33)]         0           []                               
                                                                                                  
 embedding_61 (Embedding)       (None, 33, 100)      108500      ['input_39[0][0]']               
                                                                                                  
 bidirectional_87 (Bidirectiona  (None, 33, 512)     549888      ['embedding_61[0][0]']           
 l)                                                                                               
                                                                                                  
 bidirectional_88 (Bidirectiona  (None, 512)         1182720     ['bidirectional_87[0][0]']

In [161]:
# Construct a stacked Bidirectional GRU model with Attention
input_layer = keras.layers.Input(shape=(input_len,))
embedding_layer = Embedding(total_words, 100, input_length=input_len)(input_layer)
gru_layer = Bidirectional(GRU(256, return_sequences=True))(embedding_layer)
gru_layer2 = Bidirectional(GRU(256))(gru_layer)
attention_layer = Attention(use_scale = True)([gru_layer, gru_layer2])

# Add a dense layer
dense_layer = Dense(1085, activation='relu')(attention_layer)
dropout_layer = Dropout(0.3)(dense_layer)
output_layer = keras.layers.Reshape((1085))(dropout_layer)

# Construct the model
model = keras.models.Model(inputs=[input_layer], outputs=output_layer)

# Fit the model and update RNNdf
RNNdf = fit_model(model, X, y, 'Stacked BidiGRU with Attention Dups', RNNdf, tokenizer, max_sequence_len, epochs=10, seed_text=seed_text)

# Print model summary
model.summary()


TypeError: 'int' object is not iterable