In [None]:
import pandas as pd
import numpy as np
import string
import nltk
import re

from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.preprocessing.text import Tokenizer
import tensorflow as tf

import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import TensorDataset, DataLoader, random_split
from torch.optim.lr_scheduler import ReduceLROnPlateau
from torch.nn.utils import clip_grad_norm_

[English Poem Dataset from Kaggle](https://www.kaggle.com/datasets/abdelrahmanekhaldi/english-poem-dataset)

In [None]:
# Loading the data
df = pd.read_csv('/content/drive/MyDrive/NLP-third_assignment /PoemDataset.csv/PoemDataset.csv')
df.head()

Unnamed: 0,Title,Poem,Poet,Genre
0,Search,Wandered tonight through a cityas ruined as a ...,Hester Knibbe,Fear
1,A Poem for the Cruel Majority,The cruel majority emerges!Hail to the cruel m...,Jerome Rothenberg,Anger
2,“Do Not Embrace Your Mind’s New Negro Friend”,Do not embrace your mind’s new negro friendOr ...,William Meredith,Anger
3,The Greatest Love,She is sixty. She livesthe greatest love of he...,Anna Swir,Love
4,Bilingual/Bilingüe,"My father liked them separate, one there,one h...",Rhina P. Espaillat,Anger


In [None]:
df.shape

(10000, 4)

In [None]:
#  Peot is not important to have it in dataset
df = df.drop(columns=['Poet'])
df.head(20)

Unnamed: 0,Title,Poem,Genre
0,Search,Wandered tonight through a cityas ruined as a ...,Fear
1,A Poem for the Cruel Majority,The cruel majority emerges!Hail to the cruel m...,Anger
2,“Do Not Embrace Your Mind’s New Negro Friend”,Do not embrace your mind’s new negro friendOr ...,Anger
3,The Greatest Love,She is sixty. She livesthe greatest love of he...,Love
4,Bilingual/Bilingüe,"My father liked them separate, one there,one h...",Anger
5,The War Films,"O living pictures of the dead, O songs without...",Fear
6,Of the Education of Children,"I was planted, a cropnot a boy but with headlo...",Sadness
7,"from America, America","God save America, My home, sweet...",Joy
8,Poisonous Plants of America,April foolBear's-footBog-onionDevil's-appleDog...,Fear
9,Morte d'Arthur,So all day long the noise of battle roll'd Amo...,Fear


In [None]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10000 entries, 0 to 9999
Data columns (total 3 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   Title   9999 non-null   object
 1   Poem    9928 non-null   object
 2   Genre   10000 non-null  object
dtypes: object(3)
memory usage: 234.5+ KB


# Preprocessing

# Text cleaning

In [None]:
# what I can see, in title column.  I will define function to clean the text
def clean(text):
    if pd.isnull(text):
        return ''
    # In dataset, there are some chines characters, I will remove them, but I noticed that in dataset there are non English words as well , which have the same characters as English words.
    text = re.sub(r'[^A-Za-z\s]', '', text)
    # Collapse multiple whitespace characters into a single space
    text = re.sub(r'\s+', ' ', text).strip()
    return text.lower()

 # When I apply the function, I got "TypeError: 'float' object is not iterable" that some values are not string
# For debugging the error, I used Qwen 2.5.

In [None]:
# Applying the function
df['Poem'] = df['Poem'].apply(clean)
df.head(15)

Unnamed: 0,Title,Poem,Genre
0,Search,wandered tonight through a cityas ruined as a ...,Fear
1,A Poem for the Cruel Majority,the cruel majority emergeshail to the cruel ma...,Anger
2,“Do Not Embrace Your Mind’s New Negro Friend”,do not embrace your minds new negro friendor e...,Anger
3,The Greatest Love,she is sixty she livesthe greatest love of her...,Love
4,Bilingual/Bilingüe,my father liked them separate one thereone her...,Anger
5,The War Films,o living pictures of the dead o songs without ...,Fear
6,Of the Education of Children,i was planted a cropnot a boy but with headlon...,Sadness
7,"from America, America",god save america my home sweet home,Joy
8,Poisonous Plants of America,april foolbearsfootbogoniondevilsappledog pars...,Fear
9,Morte d'Arthur,so all day long the noise of battle rolld amon...,Fear


In [None]:
# Remove non English words
# In dataset, there are some non English words.
# I used toolkit to filter non English words
# Load the NLTK
nltk.download('words')
# Words holds English words
words = set(nltk.corpus.words.words())
# lambda function for processing each poem
df['Poem'] = df['Poem'].apply(lambda x: ' '.join([word for word in x.split() if word in words]))

[nltk_data] Downloading package words to /root/nltk_data...
[nltk_data]   Unzipping corpora/words.zip.


In [None]:
df.shape

(10000, 3)

In [None]:
# I noticed some rows are empty or have only few words, Since the dataset is huge
# So I will remove those rows which have less than 5 words and more than 100 words
df = df[(df['Poem'].str.split().str.len() > 5) & (df['Poem'].str.split().str.len() <= 200)]

In [None]:
df.shape

(7591, 3)

In [None]:
# Still the dataset is huge and it will take time to train the model, I will get this error again :
# MemoryError: Unable to allocate xxxx TiB for an array with shape (7140714, 29822) and data type float64
# I will  take small piece of the dataset for training
df = df.sample(frac=0.09, random_state=42).reset_index(drop=True)

In [None]:
df.shape

(683, 3)

In [None]:
# Let's check the variation of title
df['Title'].value_counts()

Unnamed: 0_level_0,count
Title,Unnamed: 1_level_1
Cherries,2
My Brother,2
Eros of Heroines,1
Industrial Lace,1
The Universe: Original Motion Picture Soundtrack,1
...,...
Dilemma,1
Have A Good One [Just wasted],1
For Freckle-Faced Gerald,1
Live Blindly and upon the Hour,1


In [None]:
# let's check the variation of Genre
df['Genre'].value_counts()

Unnamed: 0_level_0,count
Genre,Unnamed: 1_level_1
Fear,211
Joy,159
Sadness,118
Anger,77
Surprise,61
Love,57


In [None]:
# When I create sigle continues sequence, LSTM transition between words and across all poets which means LSTM learns general patterns from the entire dataset.
poem  = df['Poem'].tolist()
# Total characters in poem
print(f'Total characters: {len(poem)}')

Total characters: 683


**Tokenization and Vocabular**


In [None]:
# I will tokenize peom. I use tokenizer to assign integer to each unique word.
tokenizer = Tokenizer(oov_token= 'OOV')
tokenizer.fit_on_texts(poem)

# Get the vocabulary size:
# converting the numerical output back into readable text, I need an inverse mapping.
# That maps integer indices back to their original words.
# I can create this by swapping the keys and values of tokenizer.word_index.

total_words = len(tokenizer.word_index)
vocab_size = len(tokenizer.word_index) + 1

# Print the result
print(f'Vocabulary size: {vocab_size}')
print('Total Words:', total_words)

Vocabulary size: 7135
Total Words: 7134


In [None]:
# Let's see a sample of the word index (mapping from word to ID)
print('Example word index mappings (word -> id):')
for word, idx in list(tokenizer.word_index.items())[:10]:
    print(word, '->', idx)
# I used Qwen 2.5 to print sample of the words index.

Example word index mappings (word -> id):
OOV -> 1
the -> 2
of -> 3
a -> 4
and -> 5
to -> 6
in -> 7
i -> 8
is -> 9
my -> 10


In [None]:
# Previously I did not know that by running help I can ser the details and about its configuration and methods
# Source : https://kgptalkie.medium.com/poetry-generation-using-tensorflow-keras-and-lstm-75c4e4b7f07e
help(tokenizer)
# Tokenizer by default converts all the words to lower case and ignores punctuation.
# So I did not need to do that in the clean function.

Help on Tokenizer in module keras.src.legacy.preprocessing.text object:

class Tokenizer(builtins.object)
 |  Tokenizer(num_words=None, filters='!"#$%&()*+,-./:;<=>?@[\\]^_`{|}~\t\n', lower=True, split=' ', char_level=False, oov_token=None, analyzer=None, **kwargs)
 |  
 |  DEPRECATED.
 |  
 |  Methods defined here:
 |  
 |  __init__(self, num_words=None, filters='!"#$%&()*+,-./:;<=>?@[\\]^_`{|}~\t\n', lower=True, split=' ', char_level=False, oov_token=None, analyzer=None, **kwargs)
 |      Initialize self.  See help(type(self)) for accurate signature.
 |  
 |  fit_on_sequences(self, sequences)
 |  
 |  fit_on_texts(self, texts)
 |  
 |  get_config(self)
 |  
 |  sequences_to_matrix(self, sequences, mode='binary')
 |  
 |  sequences_to_texts(self, sequences)
 |  
 |  sequences_to_texts_generator(self, sequences)
 |  
 |  texts_to_matrix(self, texts, mode='binary')
 |  
 |  texts_to_sequences(self, texts)
 |  
 |  texts_to_sequences_generator(self, texts)
 |  
 |  to_json(self, **kwargs

**Creating n gram Sequences for Training**



In [None]:
# I am going to create training sequences using the tokenized data. For each line of the poem,
# I generate multiple n grams, that will be used as training samples to predict the next word

In [None]:
# In each sequence the last word is the target.
# Empty list to collect all generated n gram sequences
input_sequences = []
# Generate n gram sequences for each line in the poem
for line in poem:
    # Convert the text line into a sequence of integer tokens
    token_list = tokenizer.texts_to_sequences([line])[0]
    # Generate n grams:
    # i starts from 1 to not get empty sequence
    for i in range(1, len(token_list)):

        n_gram_sequence = token_list[:i+1]
        input_sequences.append(n_gram_sequence)
print('Total training sequences (n grams):', len(input_sequences))
# I got the main code structure from lecture note

Total training sequences (n grams): 64059


**Padding Sequences and Creating Training Data (X, y)**

In [None]:
# I need to pad them so they all have the same length.
max_len = max(len(seq) for seq in input_sequences)
print('The longest sequence lenght is:',max_len)
padded = tf.keras.preprocessing.sequence.pad_sequences(
            input_sequences, maxlen=max_len, padding='pre')
# Oh makes sense because I filtered the dataset, so even I could initialize 100 to max_len.
# I got the main code structure from lecture note

The longest sequence lenght is: 200


In [None]:
padded = np.array(padded)
X = padded[:, :-1]
y = padded[:,  -1]
print("Shape of X (features):", X.shape)
print("Shape of y (labels):", y.shape)
# In longest sequence target value is including and input value is max_len - 1
# The first 99 tokens of each sequence (X) are the input words
# The 100th token (y) is the word I want the model to predict.

Shape of X (features): (64059, 199)
Shape of y (labels): (64059,)


In [None]:
# I will convert X and y to PyTorch tensors, since I am gonne  use PyTorch for the LSTM
# Convert to PyTorch tensors
X_tensor = torch.tensor(X, dtype=torch.long)
y_tensor = torch.tensor(y, dtype=torch.long)

# Let's check the tensors
print(f'Type of X {type(X_tensor)} and Shape of X:{(X_tensor.shape)}')
print(f'Type of y {type(y_tensor)} and Shape of y:{(y_tensor.shape)}')

Type of X <class 'torch.Tensor'> and Shape of X:torch.Size([64059, 199])
Type of y <class 'torch.Tensor'> and Shape of y:torch.Size([64059])


In [None]:
# I split the data into a validation to evaluate the model
dataset = TensorDataset(X_tensor, y_tensor)
# I use 80% for training, 20% for validation
train_size = int(len(dataset) * 0.1)
val_size = len(dataset) - train_size

gen = torch.Generator().manual_seed(42)
train_dataset, val_dataset = random_split(dataset, [train_size, val_size], generator=gen)
print(f'Training samples: {len(train_dataset)}, Validation samples: {len(val_dataset)}')

Training samples: 6405, Validation samples: 57654


In [None]:
# Create DataLoader for batching
BATCH_SIZE = 64
train_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=BATCH_SIZE, shuffle=False)

In [None]:
# Use GPU if available
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

**Building the LSTM Model (PyTorch)**

In [None]:
# I will define an LSTM based neural network using PyTorch.

class PoetryLSTM(nn.Module):
    def __init__(self, vocab_size, embed_dim, lstm_units, num_layers, dropout_prob, padding_idx=0):
        super(PoetryLSTM, self).__init__()
        # Embedding Layer: The first is embedding layer which takes word IDs and look up a vector.
        self.embedding = nn.Embedding(num_embeddings=vocab_size, embedding_dim=embed_dim, padding_idx=padding_idx)

        # LSTM layer: the core of my recurrent neural network
        self.lstm = nn.LSTM(input_size=embed_dim,hidden_size=lstm_units,num_layers=num_layers, batch_first=True, dropout=dropout_prob)

        # Fully connected layer: maps the LSTM output to the vocabulary size
        self.fc = nn.Linear(lstm_units, vocab_size )


      # First, X is passed through the embedding layer to get a sequence of embedding vectors
      # This sequence is then fed into the LSTM layerm
      # I apply dropout to the output of the LSTM to prevent overfitting

    def forward(self, x):
        emb = self.embedding(x)
        # Process input through LSTM:
        # h_n = hidden stat
        # c_n = cell state
        output, (h_n, c_n) = self.lstm(emb)
        last_hidden = h_n[-1]
        logits = self.fc(last_hidden)
        return logits


# The Embedding learns to place similar words nearby in vector space
# LSTM captures patterns and context over time
# The last hidden state  remebers: what the model has read so far
# The fc transforms that summary over the vocabulary, letting my model predict what comes next



# Set hyperparameters
vocab_size = len(tokenizer.word_index)
embed_dim = 30
lstm_units = 512
num_layers = 3
dropout_prob = 0.1
padding_idx = 0

# Initialize the model
model = PoetryLSTM( vocab_size=vocab_size +1 , embed_dim=embed_dim, lstm_units=lstm_units,
                   num_layers=num_layers,dropout_prob=dropout_prob, padding_idx=padding_idx)

model.to(device)

# Sources:
# https://machinelearningmastery.com/text-generation-with-lstm-in-pytorch/

PoetryLSTM(
  (embedding): Embedding(7135, 30, padding_idx=0)
  (lstm): LSTM(30, 512, num_layers=3, batch_first=True, dropout=0.1)
  (fc): Linear(in_features=512, out_features=7135, bias=True)
)

 6 599 vectors of dimension 150, where index 0 is fixed to zero for padding.<br>
 Three layer LSTM 150 D inputs and 352 units, applying dropout (0.15) between layers.<br>batch_first=True
Input and output tensors have shape (batch, seq_len, feature).

**Training the LSTM Model**

In [None]:
# Loss function, scheduler and optimizer:
# padding tokens (index 0) preventing them from affecting gradient updates
criterion = nn.CrossEntropyLoss(ignore_index=padding_idx)
optimizer = torch.optim.Adam(model.parameters(), lr=0.0005)
scheduler = ReduceLROnPlateau(optimizer, mode='min', factor=0.5)

In [None]:
# Early stopping:
best_val_loss = float('inf')
epochs_no_improve = 0
patience = 20

In [None]:
epochs = 50

for epoch in range(1, epochs+1):
    model.train()
    total_loss = 0

    for X_batch, y_batch in train_loader:
        X_batch, y_batch =  X_batch.to(device), y_batch.to(device)
        # Clear previous gradients
        optimizer.zero_grad()
        # Forward pass
        outputs = model(X_batch)
        # Calculate loss
        loss = criterion(outputs, y_batch)
        loss.backward()
        # stabilize training
        clip_grad_norm_(model.parameters(), max_norm=1.0)
        # Update weights
        optimizer.step()

        total_loss += loss.item()
    avg_loss = total_loss / len(train_loader)

    # Validation:
    # Disable training features
    model.eval()
    val_loss = 0
    with torch.no_grad():
        for val_X, val_y in val_loader:
            val_X, val_y = val_X.to(device), val_y.to(device)

            val_out = model(val_X)
            val_loss += criterion(val_out, val_y).item()
    avg_val_loss = val_loss / len(val_loader)
    # Adjust learning rate
    scheduler.step(avg_val_loss)
    print(f'Epoch {epoch}: train_loss={avg_loss:.4f} || val_loss={avg_val_loss:.4f}')

    if avg_val_loss < best_val_loss:
        best_val_loss = avg_val_loss
        epochs_no_improve = 0
    else:
        epochs_no_improve += 1
        if epochs_no_improve >= patience:
            print('Training finished.')
            break
# For degbbuing I used Gemini to understand the errors

Epoch 1: train_loss=7.0311 || val_loss=6.7126


 Training loss keeps falling but the validation loss goes upward because the model might be memorising the small training data faster than it learns patterns that generalise.<br>

In [None]:
# Save the model
torch.save(model.state_dict(), 'poetry_model.pth')


**Generating Poetry with the Trained Model**

In [None]:
# using trained LSTM model to generate poetry
# Function to generate poerty
def generate_text(seed, next_words=15):
    model.eval()  # Set the model to evaluation mode
    current_text = seed.lower()  # Start with the seed text

    for _ in range(next_words):
        # Convert current text to tokens
        token_list = tokenizer.texts_to_sequences([current_text])[0]

        # Pad the sequence to the required length
        token_list_padded = pad_sequences([token_list], maxlen=max_len, padding='pre')

        # Convert to tensor
        input_tensor = torch.tensor(token_list_padded, dtype=torch.long).to(device)

        # Predict next word
        with torch.no_grad():
            output = model(input_tensor)

        # Get the index of the highest probability word
        predicted_index = torch.argmax(output, dim=1).item()

        # Map the predicted index to a word
        predicted_word = tokenizer.index_word.get(predicted_index)

        # Append the predicted word to the current text
        current_text += " " + predicted_word

    return current_text

In [None]:
# Generating poetry
# Generate a poem line starting with a seed
seed = "the home"
poem_line = generate_text(seed, next_words=15)
print("Generated poem line:")
print(poem_line)

According to my experience working with a BERT model for detecting toxic comments and LSTM model for generating poetry in the PyTorch framework, the BERT assignment looks easier. <br>
Because the pre trained BERT comes with its own tokenizer, so I skip most of the data preprocessing that LSTM needs. Cleaning the raw text, removing weird characters is pretty much for both, but the LSTM needs extra such as building sequance. With BERT, once the text is clean. I just call BertTokenizer; pad/truncate, and fine tune.