In [43]:
import pandas as pd

In [44]:
# Load the quotes dataset (path is relative to the notebook's working directory).
df = pd.read_csv("quotes.csv")
# Preview the first rows to check the column layout.
df.head()

Unnamed: 0,quote
0,Be yourself; everyone else is already taken.
1,You've gotta dance like there's nobody watching
2,Be the change that you wish to see in the world.
3,No one can make you feel inferior without your...
4,Live as if you were to die tomorrow. Learn as ...


In [45]:
# (rows, columns) of the loaded frame.
df.shape

(2996, 1)

In [46]:
import re

def clean_text(text: str) -> str:
    """
    Normalize a piece of text to lowercase words separated by single spaces.

    Every character that is not an ASCII letter or whitespace is deleted
    (not replaced by a space), so "you've" becomes "youve". Runs of
    whitespace are then collapsed to one space, and the result is
    lower-cased and stripped of leading/trailing spaces.

    Args:
        text (str): Input text. Non-string values are returned unchanged.

    Returns:
        str: Cleaned text containing only lowercase letters and single
        spaces (or the original object when it is not a string).
    """
    # Guard clause: anything that is not a string passes through untouched.
    if not isinstance(text, str):
        return text

    letters_only = re.sub(r'[^a-zA-Z\s]', '', text)
    single_spaced = re.sub(r'\s+', ' ', letters_only)
    return single_spaced.lower().strip()

# Normalize every quote element-wise; the raw text in the column is overwritten.
df['quote'] = df['quote'].map(clean_text)

In [47]:
# Re-inspect the first rows after cleaning to confirm punctuation and case were normalized.
df.head()

Unnamed: 0,quote
0,be yourself everyone else is already taken
1,youve gotta dance like theres nobody watching
2,be the change that you wish to see in the world
3,no one can make you feel inferior without your...
4,live as if you were to die tomorrow learn as i...


In [48]:
from tensorflow.keras.preprocessing.text import Tokenizer

# Cap on the token ids the tokenizer emits; the same constant sizes the
# Embedding input and the softmax output further down.
max_vocab = 1500
# NOTE(review): per the Keras Tokenizer docs, num_words keeps only the
# (num_words - 1) most frequent words when texts are converted to sequences;
# no oov_token is set, so rarer words are dropped from the sequences rather
# than mapped to a placeholder. Confirm this is intended.
tokenizer = Tokenizer(num_words=max_vocab)
# Build the word -> id vocabulary from every quote in the column.
tokenizer.fit_on_texts(df['quote'].tolist())

In [49]:
# Number of distinct words seen during fitting, plus one.
# NOTE(review): this is larger than max_vocab because word_index is not
# truncated by num_words; the value is not referenced by the model cells
# visible in this notebook.
vocab_size = len(tokenizer.word_index) + 1
vocab_size

7567

In [50]:
# Convert each quote into a list of integer word ids.
quotes = tokenizer.texts_to_sequences(df['quote'].tolist())

In [51]:
# Spot-check: the first quote as text.
df['quote'][0]

'be yourself everyone else is already taken'

In [52]:
# Spot-check: token ids of the first quote (compare with df['quote'][0]).
quotes[0]

[13, 60, 216, 169, 5, 515, 711]

In [53]:
# Expand every tokenized quote into (prefix -> next word) training pairs:
# for ids [a, b, c] this yields ([a], b) and ([a, b], c).
X, y = [], []

for token_ids in quotes:
    for cut in range(1, len(token_ids)):
        X.append(token_ids[:cut])   # every id before position `cut`
        y.append(token_ids[cut])    # the id that follows that prefix

In [54]:
# Length of the longest training prefix; used below as the padded input width.
sequence_length = max(map(len, X))
sequence_length

252

In [55]:
from keras.preprocessing.sequence import pad_sequences

# Left-pad every prefix with zeros so all rows have sequence_length columns
# and the most recent words sit at the end of each row.
input_seq = pad_sequences(X, maxlen=sequence_length, padding='pre')

In [56]:
# Inspect the padded matrix (NumPy abbreviates the repr; rows are left-padded with zeros).
input_seq

array([[  0,   0,   0, ...,   0,   0,  13],
       [  0,   0,   0, ...,   0,  13,  60],
       [  0,   0,   0, ...,  13,  60, 216],
       ...,
       [  0,   0,   0, ...,  43, 964,  93],
       [  0,   0,   0, ..., 964,  93, 359],
       [  0,   0,   0, ...,  93, 359, 107]],
      shape=(70494, 252), dtype=int32)

In [57]:
import numpy as np
from keras.utils import to_categorical

# One-hot encode the next-word ids: one row per sample, max_vocab columns.
y_out = np.asarray(y)
y_one_hot = to_categorical(y_out, num_classes=max_vocab)

In [58]:
# Inspect the one-hot targets: one row per training sample, max_vocab columns.
y_one_hot

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]], shape=(70494, 1500))

In [59]:
from keras.models import Sequential
from keras.layers import Embedding, Dense, LSTM, GRU, Dropout

In [60]:
# Local import: this cell needs an explicit Input layer, which the earlier
# `from keras.layers import ...` line does not bring into scope.
from keras.layers import Input

# Next-word model: token ids -> 75-d embeddings -> GRU summary of the prefix
# -> softmax over the max_vocab word ids.
model = Sequential([
    # Declare the input shape explicitly. Embedding's `input_length` argument
    # is deprecated in Keras 3 (it only triggers a warning), which leaves the
    # model unbuilt so that model.summary() cannot report shapes or parameter
    # counts. An Input layer works in both Keras 2 and Keras 3.
    Input(shape=(sequence_length,), dtype='int32'),
    Embedding(
        input_dim=max_vocab,   # tokenizer ids are capped by num_words=max_vocab
        output_dim=75,
        # NOTE(review): id 0 is the padding value and is embedded like any
        # other token; consider mask_zero=True so the GRU skips the padding.
    ),
    GRU(units=75),
    # One probability per word id, matching the one-hot target width.
    Dense(units=max_vocab, activation='softmax'),
])



In [61]:
# Print the layer-by-layer architecture of the model.
model.summary()

In [62]:
# Multi-class objective over the one-hot next-word targets, optimized with RMSprop.
model.compile(optimizer='rmsprop', loss='categorical_crossentropy', metrics=['accuracy'])

In [None]:
# Train for 30 epochs in mini-batches of 128, holding out 30% of the samples
# for validation.
model.fit(
    x=input_seq,
    y=y_one_hot,
    batch_size=128,
    epochs=30,
    validation_split=0.3,
)

In [None]:
import numpy as np
from tensorflow.keras.preprocessing.sequence import pad_sequences

# Seed text for the prediction (a word or phrase, despite the variable name).
letter = "the"

# Normalize the seed exactly like the training quotes (clean_text is defined
# earlier in this notebook) so apostrophes, digits and case are handled the
# same way, then convert it to token ids.
seq = tokenizer.texts_to_sequences([clean_text(letter)])

# Left-pad to the input width the model was trained on.
inputs = pad_sequences(
    sequences=seq,
    maxlen=sequence_length,
    padding='pre'
)

# Predict: one row of class probabilities for the single seed.
prediction = model.predict(inputs)
predicted_word_index = np.argmax(prediction, axis=1)

# Look the id up in the tokenizer's reverse (id -> word) mapping instead of
# scanning word_index, and compare with a plain int rather than a length-1
# array. Id 0 is the padding value and has no word, so nothing is printed
# for it (same as before).
predicted_word = tokenizer.index_word.get(int(predicted_word_index[0]))
if predicted_word is not None:
    print(predicted_word)

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 319ms/step
world
