In [43]:
import pandas as pd
import numpy as np

import re
import nltk
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences


import torch
from torch.utils.data import TensorDataset, DataLoader, random_split
import torch.nn as nn
import torch.optim as optim

In [44]:
df =pd.read_csv('PoemDataset.csv')
df.head()

Unnamed: 0,Title,Poem,Poet,Genre
0,Search,Wandered tonight through a cityas ruined as a ...,Hester Knibbe,Fear
1,A Poem for the Cruel Majority,The cruel majority emerges!Hail to the cruel m...,Jerome Rothenberg,Anger
2,“Do Not Embrace Your Mind’s New Negro Friend”,Do not embrace your mind’s new negro friendOr ...,William Meredith,Anger
3,The Greatest Love,She is sixty. She livesthe greatest love of he...,Anna Swir,Love
4,Bilingual/Bilingüe,"My father liked them separate, one there,one h...",Rhina P. Espaillat,Anger


In [45]:
#  Poet is not important to have it in dataset
df = df.drop(columns=['Poet'])
df.head(20) 

Unnamed: 0,Title,Poem,Genre
0,Search,Wandered tonight through a cityas ruined as a ...,Fear
1,A Poem for the Cruel Majority,The cruel majority emerges!Hail to the cruel m...,Anger
2,“Do Not Embrace Your Mind’s New Negro Friend”,Do not embrace your mind’s new negro friendOr ...,Anger
3,The Greatest Love,She is sixty. She livesthe greatest love of he...,Love
4,Bilingual/Bilingüe,"My father liked them separate, one there,one h...",Anger
5,The War Films,"O living pictures of the dead, O songs without...",Fear
6,Of the Education of Children,"I was planted, a cropnot a boy but with headlo...",Sadness
7,"from America, America","God save America, My home, sweet...",Joy
8,Poisonous Plants of America,April foolBear's-footBog-onionDevil's-appleDog...,Fear
9,Morte d'Arthur,So all day long the noise of battle roll'd Amo...,Fear


In [46]:
# what I can see, in title column.  I will define function to clean the text
def clean(text):
    if pd.isnull(text):
        return ''
    # In dataset, there are some chines characters, I will remove them, but  I noticed that in dataset there are non English words as well , which have the same characters as English words.
    text = re.sub(r'[^A-Za-z\s]', '', text)
    # Collapse multiple whitespace characters into a single space
    text = re.sub(r'\s+', ' ', text).strip()
    return text.lower()
   
 # When I apply the function, I got "TypeError: 'float' object is not iterable" that some values are not string
# For debugging the error, I used Qwen 2.5,

In [47]:
df['Title'] = df['Title'].apply(clean)
df['Poem'] = df['Poem'].apply(clean)
df.head(15)

Unnamed: 0,Title,Poem,Genre
0,search,wandered tonight through a cityas ruined as a ...,Fear
1,a poem for the cruel majority,the cruel majority emergeshail to the cruel ma...,Anger
2,do not embrace your minds new negro friend,do not embrace your minds new negro friendor e...,Anger
3,the greatest love,she is sixty she livesthe greatest love of her...,Love
4,bilingualbilinge,my father liked them separate one thereone her...,Anger
5,the war films,o living pictures of the dead o songs without ...,Fear
6,of the education of children,i was planted a cropnot a boy but with headlon...,Sadness
7,from america america,god save america my home sweet home,Joy
8,poisonous plants of america,april foolbearsfootbogoniondevilsappledog pars...,Fear
9,morte darthur,so all day long the noise of battle rolld amon...,Fear


# Preprocessing

In [48]:
# Remove non English words
nltk.download('words')
words = set(nltk.corpus.words.words())
df['Title'] = df['Title'].apply(lambda x: ' '.join([word for word in x.split() if word in words]))
df['Poem'] = df['Poem'].apply(lambda x: ' '.join([word for word in x.split() if word in words]))

[nltk_data] Downloading package words to C:\nltk_data...
[nltk_data]   Package words is already up-to-date!


In [49]:
df.shape

(10000, 3)

In [50]:
# I noticed some rows are empty and some rows have only few words, that dont make sense, for generating poem
# So I will remove those rows which have less than 5 words
df = df[df['Poem'].str.split().str.len() > 20]

In [51]:
df.shape

(9546, 3)

In [52]:
# Still the dataset is huge and it will take time to train the model, I will get this error again : 
# MemoryError: Unable to allocate 1.55 TiB for an array with shape (7140714, 29822) and data type float64
# I will  take 10% of the dataset for training
df = df.sample(frac=0.05, random_state=42).reset_index(drop=True)

In [53]:
df.head(15) 

Unnamed: 0,Title,Poem,Genre
0,inventor,the jay through the in color note down invent ...,Fear
1,from deaf republic happiness,alfonso to go hold him down with my smaller fr...,Joy
2,everyday,my poor all i ask of themis to grow antennae l...,Anger
3,small shame blues,i live with the small not knowing the multiple...,Fear
4,love song for,my man is my sweet lover of my life my youth a...,Joy
5,,have i spent too much time worrying about the ...,Love
6,the altar,a broken altar lord thy servant of a heart and...,Joy
7,my little,folding up my little within my heart praying i...,Fear
8,my wolf sister,when my tiny paper onto the carpet my wolf sis...,Anger
9,,i was restless and elsewhere gut dragging and ...,Love


In [54]:
# When I create single continues sequence, LSTM transition between words and across all poets which means LSTM learns general patterns from the entire dataset. 
# combine all poems into one corpus 
poems  = '\n'.join(df['Poem'].tolist())
# Print length of corpus 
print(poems[:100])
print(f'Total characters: {len(poems)}')

the jay through the in color note down invent outdoor so fall off in plaid in front of my window i f
Total characters: 443153


##### Tokenization

In [55]:
# Initializing the Tokenizer
tokenizer = Tokenizer()
tokenizer.fit_on_texts([poems])

# Get the vocabulary size:
# converting the numerical output back into readable text, I need an inverse mapping.
# That maps integer indices back to their original words.
# I can create this by swapping the keys and values of tokenizer.word_index.

total_words = len(tokenizer.word_index)
vocab_size = len(tokenizer.word_index) + 1

# Print the result
print(f'Vocabulary size: {vocab_size}')
print('Total Words:', total_words)


Vocabulary size: 8737
Total Words: 8736


In [56]:
# Let's see a sample of the word index (mapping from word to ID)
print('Example word index mappings (word -> id):')
for word, idx in list(tokenizer.word_index.items())[:10]:
    print(word, '->', idx)
# I used LLM to print sample of words index 

Example word index mappings (word -> id):
the -> 1
and -> 2
of -> 3
a -> 4
to -> 5
in -> 6
i -> 7
my -> 8
that -> 9
with -> 10


In [57]:
# Previously I did not know that by running help I can ser the details and about its configuration and methods
# Source : https://kgptalkie.medium.com/poetry-generation-using-tensorflow-keras-and-lstm-75c4e4b7f07e
help(tokenizer)
# Getting the word index

# Tokenizer by default converts all the words to lower case and ignores punctuation.
# So I did not need to do that in the clean function.

Help on Tokenizer in module keras.src.legacy.preprocessing.text object:

class Tokenizer(builtins.object)
 |  Tokenizer(num_words=None, filters='!"#$%&()*+,-./:;<=>?@[\\]^_`{|}~\t\n', lower=True, split=' ', char_level=False, oov_token=None, analyzer=None, **kwargs)
 |  
 |  DEPRECATED.
 |  
 |  Methods defined here:
 |  
 |  __init__(self, num_words=None, filters='!"#$%&()*+,-./:;<=>?@[\\]^_`{|}~\t\n', lower=True, split=' ', char_level=False, oov_token=None, analyzer=None, **kwargs)
 |      Initialize self.  See help(type(self)) for accurate signature.
 |  
 |  fit_on_sequences(self, sequences)
 |  
 |  fit_on_texts(self, texts)
 |  
 |  get_config(self)
 |  
 |  sequences_to_matrix(self, sequences, mode='binary')
 |  
 |  sequences_to_texts(self, sequences)
 |  
 |  sequences_to_texts_generator(self, sequences)
 |  
 |  texts_to_matrix(self, texts, mode='binary')
 |  
 |  texts_to_sequences(self, texts)
 |  
 |  texts_to_sequences_generator(self, texts)
 |  
 |  to_json(self, **kwargs

##### Encoding 

In [58]:
# For poetry generation model, I will create fixed length sequences of characters as input to the  LSTM model.
# This approach allows the model to learn the sequential dependencies between characters in the poems.
# the model to predict the next letter in a word or the next character in a line of poetry.
# To prepare my data in this way, I first need to convert the entire poetry text into a sequence of integer:   
encoded_poems = tokenizer.texts_to_sequences(poems)
print('Length of encoded poems:', len(encoded_poems))

Length of encoded poems: 443153


**Generating training sequences for LSTM**

In [59]:
# Next I will create the training sequences.
# 20 characters will be used as input and the next character will be the target.
# This process will continue until the end of the text.
# encoded_poems is list of lists, to get 20 characters, I will flatten the list of lists into a single list of characters.
all_tokens = [token for poem in encoded_poems for token in poem]
seq_len = 20
X = []
y = []
for i in range(0, len(all_tokens) - seq_len-1):
    seq_in = all_tokens[i:i + seq_len]
    seq_out = all_tokens[i + seq_len]
    X.append(seq_in)
    y.append(seq_out)   

# https://machinelearningmastery.com/text-generation-with-lstm-in-pytorch/

In [60]:
# I created the sequences of 20 characters, now I will convert them into numpy arrays.
# NumPy arrays can be easily to convert to tensors
X = np.array(X)
y = np.array(y)

In [61]:
# Before moving on, I will convert x and y to tensors since I will use PyTorch 
X_tensor = torch.tensor(X, dtype=torch.long)
y_tensor = torch.tensor(y, dtype=torch.long)

In [62]:
print(f'Type of X {type(X_tensor)} and Shape of X:{(X_tensor.shape)}')  
print(f'Type of y {type(y_tensor)} and Shape of y:{(y_tensor.shape)}')

Type of X <class 'torch.Tensor'> and Shape of X:torch.Size([305360, 20])
Type of y <class 'torch.Tensor'> and Shape of y:torch.Size([305360])


In [63]:
# Creating train and val dataset:
dataset = TensorDataset(X_tensor, y_tensor)
# val_dataset is 10% of the dataset.
train_size = int(len(dataset) * 0.9)  
val_size = len(dataset) - train_size
# Randomly split the dataset into training and validation
train_dataset, val_dataset = random_split(dataset, [train_size, val_size])

print(f'Training samples: {len(train_dataset)}, Validation samples: {len(val_dataset)}')

Training samples: 274824, Validation samples: 30536


**Building the LSTM Model (PyTorch)**

In [64]:
# Now I have preprocessed the data, I will build the architecture of the LSTM model.
# The common way to create NN model in PyTorch is to create a class that inherits from nn.Module.
# Let's define PoetryLSTM
class PoetryLSTM(nn.Module):

    def __init__(self, vocab_size, embed_dim, hidden_size, num_layers, dropout_prob=0.5):
        super(PoetryLSTM, self).__init__()
        # FIRST LAYER, Embedding layers: 
        # This layer acts like a dictionary that converts each character ID into a meaningful code
        self.embedding = nn.Embedding(vocab_size, embed_dim) 
        # SECOND LAYERS:
        # LSTM Layer: In my case, the LSTM will read the sequence of word embeddings and produce an output
        self.lstm = nn.LSTM(embed_dim, hidden_size, num_layers, batch_first=True)
        # Dropout layer: randomly drops some values to reduce overfitting
        self.dropout = nn.Dropout(dropout_prob)
        # OUTPUT LAYER OR FULLY CONNECTED LAYER:
        # It maps the output to a score for each word in the vocab.
        # This doest explicitly apply softmax here, I will use CrossEntropyLoss which internally applies a softmax when comparing to the target word. 
        self.fc = nn.Linear(hidden_size, vocab_size)



     # The forward method:
     # It defines how the data flows through my model
    def forward(self, x, hidden):
        # Pass the input through the embedding layer
        embedded = self.embedding(x)
        # Pass the embedded input through the LSTM layer
        out, hidden = self.lstm(embedded, hidden)
        # Apply dropout to the LSTM output
        out = self.dropout(out)
        # Turns the 3D tensor into 2D so each row is one timestep.
        out = out.contiguous().view(-1, self.fc.in_features)
        # Pass the reshaped output through the fully connected layer 
        return out, hidden


      # These states are what allow the LSTM to maintain memory of past inputs in the sequence
    def init_hidden(self, batch_size):
        # Initialize the hidden and cell states for the LSTM
        weight = next(self.parameters()).data
        hidden = (
                  weight.new(self.lstm.num_layers, batch_size, self.lstm.hidden_size).zero_(),
                  weight.new(self.lstm.num_layers, batch_size, self.lstm.hidden_size).zero_()
                  )
        return hidden 

In [65]:
model = PoetryLSTM(vocab_size, embed_dim, hidden_size, num_layers).to(device) 
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model.to(device)
print(model)

NameError: name 'device' is not defined

**Training loop**

In [None]:
# hyperparameters
embed_dim = 100
hidden_size = 256
num_layers = 2
learning_rate = 0.001
epochs = 20
batch_size = 64

In [None]:
|