# Quote Writer Using LSTM

## Import Libraries

In [1]:
import pandas as pd
import numpy as np
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Embedding, LSTM

## Load the Data

In [2]:
# Read the quotes dataset (path is relative to the working directory) and
# preview its first rows.
quotes_csv = 'Quotes Dataset.csv'
df = pd.read_csv(quotes_csv)
df.head()

Unnamed: 0,Number,Quote,Author
0,1,The only thing we have to fear is fear itself.,Franklin D. Roosevelt
1,2,The truth will set you free.,The Bible
2,3,To be yourself in a world that is constantly t...,Ralph Waldo Emerson
3,4,"Success is not final, failure is not fatal: It...",Winston S. Churchill
4,5,The only way to do great work is to love what ...,Steve Jobs


In [3]:
# Summarize the frame: column names, non-null counts and dtypes.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 725 entries, 0 to 724
Data columns (total 3 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   Number  725 non-null    int64 
 1   Quote   725 non-null    object
 2   Author  725 non-null    object
dtypes: int64(1), object(2)
memory usage: 17.1+ KB


## EDA

In [4]:
# Column names as loaded, before 'Number' is dropped in the next cell.
df.columns

Index(['Number', 'Quote', 'Author'], dtype='object')

In [5]:
# Drop the 'Number' column. Rebinding `df` (instead of inplace=True) combined
# with errors='ignore' keeps this cell idempotent: running it a second time no
# longer raises a KeyError once the column is already gone.
df = df.drop(columns=['Number'], errors='ignore')

In [6]:
# Check the remaining columns after dropping 'Number'.
df.columns

Index(['Quote', 'Author'], dtype='object')

In [7]:
# Number of quotes per author.
df['Author'].value_counts()

Author
Albert Einstein           98
Buddha                    51
Confucius                 50
Thomas Edison             50
Wayne Gretzky             50
Theodore Roosevelt        50
Nelson Mandela            50
Lao Tzu                   50
Abraham Lincoln           49
Zig Ziglar                49
Jimmy Dean                49
Walt Disney               49
Pablo Picasso             49
Eleanor Roosevelt          2
Robert Frost               2
Steve Jobs                 2
Helen Keller               2
Franklin D. Roosevelt      2
E.E. Cummings              1
James Baldwin              1
Sam Levenson               1
Ralph Waldo Emerson        1
Winston S. Churchill       1
Dalai Lama                 1
Albert Schweitzer          1
George Eliot               1
Mark Twain                 1
Frank Sinatra              1
John Lennon                1
The Bible                  1
Booker T. Washington       1
Mahatma Gandhi             1
Martin Luther King Jr.     1
Norman Vincent Peale       1
Joseph 

In [8]:
# Count missing values in each column.
df.isna().sum()

Quote     0
Author    0
dtype: int64

In [9]:
# Length of the longest quote, measured in characters.
max(df['Quote'].str.len())


125

In [10]:
# Character length of the longest quote. Note that `max_len` is reassigned
# further down to the longest *token* sequence before it is used for padding.
quote_char_counts = df['Quote'].str.len()
max_len = max(quote_char_counts)
max_len

125

In [11]:
# Select the quote text as a Series; the tokenization cells below work on it.
quotes = df['Quote']

In [12]:
# Preview the quote Series.
quotes

0         The only thing we have to fear is fear itself.
1                           The truth will set you free.
2      To be yourself in a world that is constantly t...
3      Success is not final, failure is not fatal: It...
4      The only way to do great work is to love what ...
                             ...                        
720            Believe you can and you're halfway there.
721    The mind is everything. What you think you bec...
722    I have not failed. I've just found 10,000 ways...
723    A journey of a thousand miles begins with a si...
724          It always seems impossible until it's done.
Name: Quote, Length: 725, dtype: object

In [13]:
# Keras Tokenizer created with its default settings (no arguments passed).
tokenizer = Tokenizer()

In [14]:
# Fit the tokenizer on the quotes; this populates tokenizer.word_index.
tokenizer.fit_on_texts(quotes)


In [15]:
# Word -> integer id mapping learned from the quotes (ids start at 1).
tokenizer.word_index

{'you': 1,
 'the': 2,
 'to': 3,
 'it': 4,
 'of': 5,
 'a': 6,
 'can': 7,
 'is': 8,
 'be': 9,
 'have': 10,
 'and': 11,
 'but': 12,
 'i': 13,
 'not': 14,
 'everything': 15,
 'always': 16,
 "can't": 17,
 'my': 18,
 'do': 19,
 'that': 20,
 'what': 21,
 'we': 22,
 'only': 23,
 'life': 24,
 'success': 25,
 'yourself': 26,
 'else': 27,
 'work': 28,
 "don't": 29,
 'if': 30,
 'change': 31,
 'your': 32,
 'as': 33,
 "you're": 34,
 'believe': 35,
 "i've": 36,
 'on': 37,
 "it's": 38,
 '10': 39,
 'until': 40,
 'with': 41,
 'one': 42,
 'mind': 43,
 'become': 44,
 'impossible': 45,
 'thing': 46,
 'making': 47,
 'miss': 48,
 '100': 49,
 'shots': 50,
 'take': 51,
 'halfway': 52,
 'there': 53,
 'why': 54,
 'failed': 55,
 'just': 56,
 'found': 57,
 '000': 58,
 'ways': 59,
 "won't": 60,
 'journey': 61,
 'thousand': 62,
 'miles': 63,
 'begins': 64,
 'step': 65,
 'think': 66,
 'seems': 67,
 'done': 68,
 'dream': 69,
 'are': 70,
 'keep': 71,
 'direction': 72,
 'wind': 73,
 'adjust': 74,
 'sails': 75,
 'reach':

In [16]:
# Keep a handle on the vocabulary mapping and record how many words it holds.
tok_dict = tokenizer.word_index
tok_len = len(tok_dict)
tok_len


249

In [17]:
# Encode every quote as a list of integer word ids.
tokized_sequnece = tokenizer.texts_to_sequences(quotes)

In [18]:
# Show the first two encoded quotes.
tokized_sequnece[:2]

[[2, 23, 46, 22, 10, 3, 115, 8, 115, 116], [2, 144, 106, 145, 1, 146]]

In [19]:
# Build the next-word training samples: every prefix of length >= 2 of each
# tokenized quote becomes one sequence (its last token is the word to predict).
# Example with tokenized_sen = [1, 2, 3]:
#   i = 1 -> tokenized_sen[:2] -> [1, 2]
#   i = 2 -> tokenized_sen[:3] -> [1, 2, 3]
input_seq = []

# Tokenize all quotes in one call. The previous version split each quote on
# '\n' and kept only the first piece, which silently dropped any text after a
# line break and invoked the tokenizer once per quote.
for tokenized_sen in tokenizer.texts_to_sequences(quotes):
    for i in range(1, len(tokenized_sen)):
        input_seq.append(tokenized_sen[:i + 1])


In [20]:
# Total number of prefix sequences generated by the loop above.
len(input_seq)


7707

In [21]:
# Longest prefix sequence, in tokens; used as the padded length below.
max_len = max(map(len, input_seq))
max_len

24

In [22]:
# Left-pad every sequence with zeros so all rows share the length `max_len`.
x_padded = pad_sequences(
    sequences=input_seq,
    maxlen=max_len,
    padding='pre',
)
x_padded[:2]

array([[ 0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
         0,  0,  0,  0,  0,  0,  2, 23],
       [ 0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
         0,  0,  0,  0,  0,  2, 23, 46]])

In [23]:
# Split each padded row: all but the last token are the model input,
# the last token is the next-word label.
x, y = x_padded[:, :-1], x_padded[:, -1]

In [24]:
# Shapes of the input matrix and the label vector.
x.shape,y.shape


((7707, 23), (7707,))

In [25]:
from tensorflow.keras.utils import to_categorical
# One-hot encode the next-word labels. The labels are read straight from the
# last column of `x_padded` rather than from `y`, so re-running this cell does
# not one-hot encode an already one-hot matrix. tok_len + 1 classes are needed
# because word ids start at 1 and id 0 is the padding value.
y = to_categorical(x_padded[:, -1], num_classes=tok_len + 1)
y[:2]

array([[0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 

In [26]:
# Next-word prediction network: embedding -> three stacked LSTMs -> softmax
# over the vocabulary. Sizes are derived from the data instead of being
# hard-coded, so the model stays consistent if the dataset changes.
vocab_size = tok_len + 1   # word ids start at 1; id 0 is the padding value
seq_len = x.shape[1]       # number of context tokens per training sample

model = Sequential()
model.add(Embedding(input_dim=vocab_size, output_dim=100, input_shape=(seq_len,)))
# The first two LSTMs return the full sequence so the next LSTM can consume it.
model.add(LSTM(150, return_sequences=True))
model.add(LSTM(150, return_sequences=True))
model.add(LSTM(128))
# One output unit per class, matching the one-hot width of `y`.
model.add(Dense(vocab_size, activation='softmax'))

In [28]:
# Configure training: Adam optimizer, categorical cross-entropy loss
# (labels are one-hot), and accuracy as the reported metric.
model.compile(
    optimizer='adam',
    loss='categorical_crossentropy',
    metrics=['accuracy'],
)

In [27]:
# Print the layer-by-layer overview with output shapes and parameter counts.
model.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, 23, 100)           25000     
                                                                 
 lstm (LSTM)                 (None, 23, 150)           150600    
                                                                 
 lstm_1 (LSTM)               (None, 23, 150)           180600    
                                                                 
 lstm_2 (LSTM)               (None, 128)               142848    
                                                                 
 dense (Dense)               (None, 250)               32250     
                                                                 
Total params: 531298 (2.03 MB)
Trainable params: 531298 (2.03 MB)
Non-trainable params: 0 (0.00 Byte)
_________________________________________________________________


In [29]:
# Train on the prefix sequences for 15 passes over the data.
model.fit(x=x, y=y, epochs=15)

Epoch 1/15
Epoch 2/15
Epoch 3/15
Epoch 4/15
Epoch 5/15
Epoch 6/15
Epoch 7/15
Epoch 8/15
Epoch 9/15
Epoch 10/15
Epoch 11/15
Epoch 12/15
Epoch 13/15
Epoch 14/15
Epoch 15/15


<keras.src.callbacks.History at 0x1365e5e3b20>

In [30]:
# Save the trained model to 'quote_writer.h5' in the working directory.
# NOTE(review): the output below looks like a truncated warning from the saving
# API — presumably about the legacy .h5 format; confirm.
model.save('quote_writer.h5')

  saving_api.save_model(


In [41]:
import pickle

# Persist the fitted tokenizer so the same word -> id mapping can be reused
# at inference time.
with open("new_tokenizer.pkl", mode="wb") as tokenizer_file:
    pickle.dump(tokenizer, tokenizer_file)

In [35]:
import tensorflow as tf
# Reload the trained model from the file written by model.save(...).
saved_model = tf.keras.models.load_model("quote_writer.h5")

# Load the tokenizer from the file this notebook actually writes
# ("new_tokenizer.pkl"). The previous path, "tokenizer_of_quotes.pkl", is never
# created here, so a fresh Restart & Run All either failed or picked up a stale
# tokenizer. The context manager also closes the file handle.
# NOTE: pickle.load can execute arbitrary code; only load files you created.
with open("new_tokenizer.pkl", "rb") as f:
    saved_tokenizer = pickle.load(f)

In [40]:
# Greedy text generation: start from a seed and repeatedly append the word the
# model considers most likely to come next.
user_text = "Believe"
n_words = 5  # how many words to append to the seed

# Context length the network expects, read from the model instead of a
# hard-coded 23 so it stays correct if the model is retrained on other data.
seq_len = saved_model.input_shape[1]

for _ in range(n_words):
    # Encode the text generated so far and left-pad it to the model's input length.
    text_token = saved_tokenizer.texts_to_sequences([user_text])[0]
    print(text_token)
    input_x = pad_sequences([text_token], maxlen=seq_len, padding='pre')

    # The batch holds a single sample; take the id with the highest probability.
    predictions = saved_model.predict(input_x)
    pos = int(np.argmax(predictions[0]))
    print(pos)

    # Direct id -> word lookup instead of scanning word_index on every step.
    word = saved_tokenizer.index_word.get(pos)
    if word is None:
        # Id 0 is padding and has no word; the input would not change, so stop.
        break
    user_text = user_text + ' ' + word
    print(user_text)
        

[35]
1
Believe you
[35, 1]
7
Believe you can
[35, 1, 7]
11
Believe you can and
[35, 1, 7, 11]
34
Believe you can and you're
[35, 1, 7, 11, 34]
52
Believe you can and you're halfway
