**Dataset Preparation:**

In [69]:
import pandas as pd
import re
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
import numpy as np
from keras.models import Sequential
from keras.layers import Embedding, LSTM, Dense, Dropout
import random

data = pd.read_csv('/content/drive/MyDrive/nndl/PoetryFoundationData.csv')

In [70]:
print(data.describe())

         Unnamed: 0
count  13854.000000
mean      93.204417
std       57.493544
min        0.000000
25%       42.000000
50%       92.000000
75%      142.000000
max      199.000000


In [71]:
print(data.head())

   Unnamed: 0                                              Title  \
0           0  \r\r\n                    Objects Used to Prop...   
1           1  \r\r\n                    The New Church\r\r\n...   
2           2  \r\r\n                    Look for Me\r\r\n   ...   
3           3  \r\r\n                    Wild Life\r\r\n     ...   
4           4  \r\r\n                    Umbrella\r\r\n      ...   

                                                Poem              Poet Tags  
0  \r\r\nDog bone, stapler,\r\r\ncribbage board, ...  Michelle Menting  NaN  
1  \r\r\nThe old cupola glinted above the clouds,...     Lucia Cherciu  NaN  
2  \r\r\nLook for me under the hood\r\r\nof that ...        Ted Kooser  NaN  
3  \r\r\nBehind the silo, the Mother Rabbit\r\r\n...   Grace Cavalieri  NaN  
4  \r\r\nWhen I push your button\r\r\nyou fly off...      Connie Wanek  NaN  


In [72]:
print(data.shape)

(13854, 5)


In [73]:
print(data.info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 13854 entries, 0 to 13853
Data columns (total 5 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   Unnamed: 0  13854 non-null  int64 
 1   Title       13854 non-null  object
 2   Poem        13854 non-null  object
 3   Poet        13854 non-null  object
 4   Tags        12899 non-null  object
dtypes: int64(1), object(4)
memory usage: 541.3+ KB
None


**Data Preprocessing:**

In [74]:
corpus = "\n".join(data['Poet'].values)

In [75]:
corpus = corpus.lower()
corpus = re.sub(r'[^\w\s]', '', corpus)

In [76]:
tokenizer = Tokenizer()
tokenizer.fit_on_texts([corpus])
total_words = len(tokenizer.word_index) + 1

# Convert text into sequences of integers
input_sequences = []
corpus_words = corpus.split()
for i in range(5, len(corpus_words)):
    sequence = corpus_words[i-5:i+1]
    tokenized_seq = tokenizer.texts_to_sequences([" ".join(sequence)])[0]
    input_sequences.append(tokenized_seq)

# Pad sequences
max_sequence_len = 5  # length of each sequence
input_sequences = pad_sequences(input_sequences, maxlen=max_sequence_len + 1)

In [77]:
X, y = input_sequences[:, :-1], input_sequences[:, -1]
X, y = X[:10000], y[:10000]
y = np.array(y)

**LSTM Model Development:**

In [78]:
model = Sequential()
model.add(Embedding(input_dim=total_words, output_dim=100, input_length=max_sequence_len))
model.add(LSTM(100, return_sequences=True))
model.add(Dropout(0.2))
model.add(LSTM(100))
model.add(Dropout(0.2))
model.add(Dense(total_words, activation='softmax'))



**Training:**

In [79]:
model.compile(loss='sparse_categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

In [80]:
model.fit(X, y, epochs=100, batch_size =128, verbose=1)

Epoch 1/100
[1m79/79[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 7ms/step - accuracy: 0.0078 - loss: 8.1284
Epoch 2/100
[1m79/79[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 6ms/step - accuracy: 0.0087 - loss: 7.2124
Epoch 3/100
[1m79/79[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 6ms/step - accuracy: 0.0118 - loss: 7.0694
Epoch 4/100
[1m79/79[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 6ms/step - accuracy: 0.0127 - loss: 6.9906
Epoch 5/100
[1m79/79[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 7ms/step - accuracy: 0.0173 - loss: 6.8901
Epoch 6/100
[1m79/79[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 9ms/step - accuracy: 0.0150 - loss: 6.7589
Epoch 7/100
[1m79/79[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 9ms/step - accuracy: 0.0216 - loss: 6.6016
Epoch 8/100
[1m79/79[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 9ms/step - accuracy: 0.0200 - loss: 6.4311
Epoch 9/100
[1m79/79[0m [32m━━━━━━━━━━━━━━━━━

<keras.src.callbacks.history.History at 0x7f784b1bc310>

**Text Generation:**

In [81]:
def generate_poetry(seed_text, next_words=5000):
    generated_words = set()
    poem = seed_text

    for _ in range(next_words):
        token_list = tokenizer.texts_to_sequences([poem])[0]
        token_list = pad_sequences([token_list], maxlen=max_sequence_len, padding='pre')

        predicted_probs = model.predict(token_list, verbose=0)
        predicted = np.argmax(predicted_probs, axis=-1)

        next_word = tokenizer.index_word.get(predicted[0], None)
        if next_word is None or next_word in generated_words:
            continue

        generated_words.add(next_word)
        poem += " " + next_word

    return poem

print(generate_poetry("The Morning Sun Shine", next_words=500))


The Morning Sun Shine kazim talamantez brolaski julian


**Evaluation and Experimentation**

In [82]:
print(generate_poetry("The Morning Sun Shine", next_words=50))

The Morning Sun Shine kazim talamantez brolaski julian
