In [3]:
import pandas as pd
import numpy as np
import string
import nltk
import re

from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.preprocessing.text import Tokenizer
import tensorflow as tf

import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import TensorDataset, DataLoader, random_split
from torch.optim.lr_scheduler import ReduceLROnPlateau
from torch.nn.utils import clip_grad_norm_
import torch.nn.functional as F
from google.colab import files

In [4]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


[English Poem Dataset from Kaggle](https://www.kaggle.com/datasets/abdelrahmanekhaldi/english-poem-dataset)

In [5]:
# Loading the data
df = pd.read_csv('/content/drive/MyDrive/NLP-third_assignment -20250504T164430Z-1-001/NLP-third_assignment/PoemDataset.csv/PoemDataset.csv')
df.head()

Unnamed: 0,Title,Poem,Poet,Genre
0,Search,Wandered tonight through a cityas ruined as a ...,Hester Knibbe,Fear
1,A Poem for the Cruel Majority,The cruel majority emerges!Hail to the cruel m...,Jerome Rothenberg,Anger
2,“Do Not Embrace Your Mind’s New Negro Friend”,Do not embrace your mind’s new negro friendOr ...,William Meredith,Anger
3,The Greatest Love,She is sixty. She livesthe greatest love of he...,Anna Swir,Love
4,Bilingual/Bilingüe,"My father liked them separate, one there,one h...",Rhina P. Espaillat,Anger


In [6]:
df.shape

(10000, 4)

In [7]:
#  Peot is not important to have it in dataset
df = df.drop(columns=['Poet'])
df.head(20)

Unnamed: 0,Title,Poem,Genre
0,Search,Wandered tonight through a cityas ruined as a ...,Fear
1,A Poem for the Cruel Majority,The cruel majority emerges!Hail to the cruel m...,Anger
2,“Do Not Embrace Your Mind’s New Negro Friend”,Do not embrace your mind’s new negro friendOr ...,Anger
3,The Greatest Love,She is sixty. She livesthe greatest love of he...,Love
4,Bilingual/Bilingüe,"My father liked them separate, one there,one h...",Anger
5,The War Films,"O living pictures of the dead, O songs without...",Fear
6,Of the Education of Children,"I was planted, a cropnot a boy but with headlo...",Sadness
7,"from America, America","God save America, My home, sweet...",Joy
8,Poisonous Plants of America,April foolBear's-footBog-onionDevil's-appleDog...,Fear
9,Morte d'Arthur,So all day long the noise of battle roll'd Amo...,Fear


In [8]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10000 entries, 0 to 9999
Data columns (total 3 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   Title   9999 non-null   object
 1   Poem    9928 non-null   object
 2   Genre   10000 non-null  object
dtypes: object(3)
memory usage: 234.5+ KB


# Preprocessing

# Text cleaning

In [9]:
# what I can see, in title column.  I will define function to clean the text
def clean(text):
    if pd.isnull(text):
        return ''
    # In dataset, there are some chines characters, I will remove them, but I noticed that in dataset there are non English words as well , which have the same characters as English words.
    text = re.sub(r'[^A-Za-z\s]', '', text)
    # Collapse multiple whitespace characters into a single space
    text = re.sub(r'\s+', ' ', text).strip()
    return text.lower()

 # When I apply the function, I got "TypeError: 'float' object is not iterable" that some values are not string
# For debugging the error, I used Qwen 2.5.

In [10]:
# Applying the function
df['Poem'] = df['Poem'].apply(clean)
df.head(15)

Unnamed: 0,Title,Poem,Genre
0,Search,wandered tonight through a cityas ruined as a ...,Fear
1,A Poem for the Cruel Majority,the cruel majority emergeshail to the cruel ma...,Anger
2,“Do Not Embrace Your Mind’s New Negro Friend”,do not embrace your minds new negro friendor e...,Anger
3,The Greatest Love,she is sixty she livesthe greatest love of her...,Love
4,Bilingual/Bilingüe,my father liked them separate one thereone her...,Anger
5,The War Films,o living pictures of the dead o songs without ...,Fear
6,Of the Education of Children,i was planted a cropnot a boy but with headlon...,Sadness
7,"from America, America",god save america my home sweet home,Joy
8,Poisonous Plants of America,april foolbearsfootbogoniondevilsappledog pars...,Fear
9,Morte d'Arthur,so all day long the noise of battle rolld amon...,Fear


In [11]:
# Remove non English words
# In dataset, there are some non English words.
# I used toolkit to filter non English words
# Load the NLTK
nltk.download('words')
# Words holds English words
words = set(nltk.corpus.words.words())
# lambda function for processing each poem
df['Poem'] = df['Poem'].apply(lambda x: ' '.join([word for word in x.split() if word in words]))

[nltk_data] Downloading package words to /root/nltk_data...
[nltk_data]   Unzipping corpora/words.zip.


In [12]:
df.shape

(10000, 3)

In [13]:
# I noticed some rows are empty or have only few words, Since the dataset is huge
# So I will remove those rows which have less than 5 words and more than 100 words
df = df[(df['Poem'].str.split().str.len() > 5) & (df['Poem'].str.split().str.len() <= 200)]

In [14]:
df.shape

(7591, 3)

In [15]:
# Still the dataset is huge and it will take time to train the model, I will get this error again :
# MemoryError: Unable to allocate xxxx TiB for an array with shape (7140714, 29822) and data type float64
# I will  take small piece of the dataset for training
df = df.sample(frac=0.5, random_state=42).reset_index(drop=True)

In [16]:
df.shape

(3796, 3)

In [17]:
# Let's check the variation of title
df['Title'].value_counts()

Unnamed: 0_level_0,count
Title,Unnamed: 1_level_1
Song,9
Summer,5
Autumn,4
Sonnet,4
Sustenance,3
...,...
Drunk Judgment,1
Blackberrying,1
Blind Curse,1
Aubade﻿,1


In [18]:
# let's check the variation of Genre
df['Genre'].value_counts()

Unnamed: 0_level_0,count
Genre,Unnamed: 1_level_1
Fear,1185
Joy,931
Sadness,620
Anger,448
Surprise,328
Love,284


In [19]:
# When I create sigle continues sequence, LSTM transition between words and across all poets which means LSTM learns general patterns from the entire dataset.
poem  = df['Poem'].tolist()
# Total characters in poem
print(f'Total characters: {len(poem)}')

Total characters: 3796


**Tokenization and Vocabular**


In [20]:
# I will tokenize peom. I use tokenizer to assign integer to each unique word.
tokenizer = Tokenizer()
tokenizer.fit_on_texts(poem)

# Get the vocabulary size:
# converting the numerical output back into readable text, I need an inverse mapping.
# That maps integer indices back to their original words.
# I can create this by swapping the keys and values of tokenizer.word_index.

total_words = len(tokenizer.word_index)
vocabulary_size = len(tokenizer.word_index) + 1

# Print the result
print(f'Vocabulary size: {vocabulary_size}')
print('Total Words:', total_words)

Vocabulary size: 16424
Total Words: 16423


In [21]:
# Let's see a sample of the word index (mapping from word to ID)
print('Example word index mappings (word -> id):')
for word, idx in list(tokenizer.word_index.items())[:10]:
    print(word, '->', idx)
# I used Qwen 2.5 to print sample of the words index.

Example word index mappings (word -> id):
the -> 1
of -> 2
a -> 3
and -> 4
to -> 5
in -> 6
i -> 7
my -> 8
is -> 9
you -> 10


In [22]:
# Previously I did not know that by running help I can ser the details and about its configuration and methods
# Source : https://kgptalkie.medium.com/poetry-generation-using-tensorflow-keras-and-lstm-75c4e4b7f07e
help(tokenizer)
# Tokenizer by default converts all the words to lower case and ignores punctuation.
# So I did not need to do that in the clean function.

Help on Tokenizer in module keras.src.legacy.preprocessing.text object:

class Tokenizer(builtins.object)
 |  Tokenizer(num_words=None, filters='!"#$%&()*+,-./:;<=>?@[\\]^_`{|}~\t\n', lower=True, split=' ', char_level=False, oov_token=None, analyzer=None, **kwargs)
 |  
 |  DEPRECATED.
 |  
 |  Methods defined here:
 |  
 |  __init__(self, num_words=None, filters='!"#$%&()*+,-./:;<=>?@[\\]^_`{|}~\t\n', lower=True, split=' ', char_level=False, oov_token=None, analyzer=None, **kwargs)
 |      Initialize self.  See help(type(self)) for accurate signature.
 |  
 |  fit_on_sequences(self, sequences)
 |  
 |  fit_on_texts(self, texts)
 |  
 |  get_config(self)
 |  
 |  sequences_to_matrix(self, sequences, mode='binary')
 |  
 |  sequences_to_texts(self, sequences)
 |  
 |  sequences_to_texts_generator(self, sequences)
 |  
 |  texts_to_matrix(self, texts, mode='binary')
 |  
 |  texts_to_sequences(self, texts)
 |  
 |  texts_to_sequences_generator(self, texts)
 |  
 |  to_json(self, **kwargs

**Creating n gram Sequences for Training**



In [23]:
# I am going to create training sequences using the tokenized data. For each line of the poem,
# I generate multiple n grams, that will be used as training samples to predict the next word

In [24]:
# In each sequence the last word is the target.
# Empty list to collect all generated n gram sequences
input_sequences = []
# Generate n gram sequences for each line in the poem
for line in poem:
    # Convert the text line into a sequence of integer tokens
    token_list = tokenizer.texts_to_sequences([line])[0]
    # Generate n grams:
    # i starts from 1 to not get empty sequence
    for i in range(1, len(token_list)):

        n_gram_sequence = token_list[:i+1]
        input_sequences.append(n_gram_sequence)
print('Total training sequences (n grams):', len(input_sequences))
# I got the main code structure from lecture note

Total training sequences (n grams): 355809


**Padding Sequences and Creating Training Data (X, y)**

In [25]:
# I need to pad them so they all have the same length.
max_len = max(len(seq) for seq in input_sequences)
print('The longest sequence lenght is:',max_len)
padded = tf.keras.preprocessing.sequence.pad_sequences(
            input_sequences, maxlen=max_len, padding='pre')
# Oh makes sense because I filtered the dataset, so even I could initialize 200 to max_len.
# I got the main code structure from lecture note

The longest sequence lenght is: 200


In [26]:
padded = np.array(padded)
X = padded[:, :-1]
y = padded[:,  -1]
print("Shape of X (features):", X.shape)
print("Shape of y (labels):", y.shape)
# In longest sequence target value is including and input value is max_len - 1
# The first 199 tokens of each sequence (X) are the input words
# The 200th token (y) is the word I want the model to predict.

Shape of X (features): (355809, 199)
Shape of y (labels): (355809,)


In [27]:
# I will convert X and y to PyTorch tensors, since I am gonne  use PyTorch for the LSTM
# Convert to PyTorch tensors
X_tensor = torch.tensor(X, dtype=torch.long)
y_tensor = torch.tensor(y, dtype=torch.long)

# Let's check the tensors
print(f'Type of X {type(X_tensor)} and Shape of X:{(X_tensor.shape)}')
print(f'Type of y {type(y_tensor)} and Shape of y:{(y_tensor.shape)}')

Type of X <class 'torch.Tensor'> and Shape of X:torch.Size([355809, 199])
Type of y <class 'torch.Tensor'> and Shape of y:torch.Size([355809])


In [28]:
# I split the data into a validation to evaluate the model
dataset = TensorDataset(X_tensor, y_tensor)
# I use 80% for training, 20% for validation
val_size = int(len(dataset) * 0.2)
train_size = len(dataset) - val_size

gen = torch.Generator().manual_seed(42)
train_dataset, val_dataset = random_split(dataset, [train_size, val_size], generator=gen)
print(f'Training samples: {len(train_dataset)}, Validation samples: {len(val_dataset)}')

Training samples: 284648, Validation samples: 71161


In [29]:
# Create DataLoader for batching
BATCH_SIZE = 64
train_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=BATCH_SIZE, shuffle=False)

In [30]:
# Use GPU if available
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

**Building the LSTM Model (PyTorch)**

In [38]:
# I will define an LSTM based neural network using PyTorch.
class PoetryLSTM(nn.Module):
    def __init__(self, vocab_size, embed_dim, lstm_units, n_layers, dropout):
        super().__init__()

        # Embedding Layer: The first is embedding layer
        self.embedding = nn.Embedding(vocab_size, embed_dim)
        # LSTM layer:
        self.lstm = nn.LSTM(embed_dim, lstm_units, n_layers, dropout = dropout , batch_first=True,)
        # Fully connected layer:
        self.fc = nn.Linear(lstm_units, vocab_size )


    # Forward method:
    def forward(self, x):
      emb = self.embedding(x)
      output_seq, _ = self.lstm(emb)
      batch_size, seq_len, hidden_size = output_seq.size()
      flat = output_seq.contiguous().view(-1, hidden_size)
      flat_logits = self.fc(flat)
      logits = flat_logits.view(batch_size, seq_len, -1)
      return logits

# In Forward, model can learn the poem structure and predicts next words
# I created kind of chain, where you train the model to predict next words
# So for each training, I feed the model 199 words and y is 200th.
# In Forward is defined that model predicts right after 199th word
# In train function, I will compare predicton with true next word
# https://machinelearningmastery.com/text-generation-with-lstm-in-pytorch/

# For me, making proper forward method was tricky, At the beginning,  I didn't know how exactly to create forward
# In some part for debugging, I used Qwen

In [None]:
# Now I need to make function to manage the training loop
def train_function (model, train_loader, criterion, optimizer, device):
    model.train()
    total_loss = 0.0
    for X, y in train_loader:
        X, y = X.to(device), y.to(device)
        # Clear old Gradients
        optimizer.zero_grad()
        # Forward pass:
        outputs = model(X)
        last_logits = outputs[:, -1, :]
        loss = criterion(last_logits, y)
        loss.backward()
        optimizer.step()
        total_loss += loss.item()
    return total_loss / len(train_loader)

# In this loop model predicts the 200th word of in sequence
# last_logits = outputs[:, -1, :], I pick the prediction which is y

In [33]:
# To monitor the model, I need to create a validation function
def validate_function(model, val_loader, criterion, device):
    model.eval()
    total_loss = 0.0
    # No backwar() just forward passes and loss computations
    with torch.no_grad():
        for X, y in val_loader:
            X, y = X.to(device), y.to(device)
            outputs = model(X)
            last_logits = outputs[:, -1, :]
            loss = criterion(last_logits, y)
            total_loss += loss.item()
    return total_loss / len(val_loader)
# Here I validation function to uses the exact same forward logic but never updates weights
# it  measures how well the model predicts unseen data.

In [34]:
def train(model, train_loader, val_loader, device):
  # Early stopping
    no_improve = 0
    patience = 3
    epochs = 20
    best_val = 0
    history = {'train_loss': [], 'val_loss': []}

    # Set up loss, optimizer, scheduler
    criterion = nn.CrossEntropyLoss(ignore_index=0)
    optimizer = torch.optim.AdamW(model.parameters(), lr=0.001, weight_decay=0.005)
    scheduler   = ReduceLROnPlateau(optimizer, mode='min', factor=0.2, patience=patience)


    for epoch in range(1, epochs):
        train_loss = train_function(model, train_loader, criterion, optimizer, device)
        val_loss = validate_function(model, val_loader,   criterion, device)
        # Scheduler updates learning rate if no improvement
        scheduler.step(val_loss)

        history['train_loss'].append(train_loss)
        history['val_loss'].append(val_loss)

        print(f"[Epoch {epoch}/{epochs}] train={train_loss:.4f} val={val_loss:.4f}")
        # Early stopping logic
        if val_loss < best_val:
            best_val, no_improve = val_loss, 0
            torch.save(model.state_dict(), 'best_lstm.pt')
        else:
            no_improve += 1
            if no_improve >= patience:
                print("Early stopping.")
                break
    return history
    # Scheduler sees val_loss to for learning rate adjusting
    # If val_loss improves, reset no_improve and save the model.

In [35]:
# Set hyperparameters
vocab_size = vocabulary_size
embed_dim = 100
lstm_units = 256
n_layers = 2
dropout = 0.3

In [36]:
# Initialize the model
model = PoetryLSTM(vocab_size , embed_dim, lstm_units, n_layers, dropout).to(device)
print(model)

PoetryLSTM(
  (embedding): Embedding(16424, 100)
  (lstm): LSTM(100, 256, num_layers=2, batch_first=True, dropout=0.3)
  (fc): Linear(in_features=256, out_features=16424, bias=True)
)


In [37]:
# Run training
train(model, train_loader, val_loader, device)

KeyboardInterrupt: 

In [None]:
# Save model
torch.save(model.state_dict(), 'LSTM.pth')

# Download the saved model
files.download('LSTM.pth')

In [None]:
def generate_poem(model, tokenizer, seed_text, max_len, gen_length,temperature= 1, device='cpu'):

    model.eval()
    # Text to token id
    seq = tokenizer.texts_to_sequences([seed_text])[0]

    for _ in range(gen_length):
        padded = pad_sequences([seq],maxlen=max_len,padding='pre')
        inp = torch.tensor(padded[:, :-1], dtype=torch.long, device=device)



        with torch.no_grad():
            logits = model(inp)
        last_logits = logits[0, -1, :]
        # converts logits to probabilities
        probs = F.softmax(last_logits / temperature, dim=-1)
        next_id = torch.multinomial(probs, num_samples=1).item()

        seq.append(next_id)

  # Convert all token id back into words, skipping zeros
    words = []
    for idx in seq:
        if idx == 0:
            continue
        words.append(tokenizer.index_word.get(idx, '<unk>'))

    return ' '.join(words)
  # temperature controls randomness in sampling
  # Here I did padding and called forward method exactly the same way as in training/validation
  # I sliced off the final prediction, matching last_logits = outputs[:, -1, :] in training.

**Generating Poetry with the Trained Model**

In [None]:
model.to(device)
seed = 'Your smile'
poem = generate_poem(model,tokenizer,seed_text=seed,max_len=200,
                     gen_length=20,temperature=0.8,device=device)
print(poem)

I built many short sequences ending in a target value.<br>
forward method defines how a batch of token sequences turns into prediction scores.
train_function uses that forward logic to perform parameter updates on each mini batch.

validate function uses the same forward logic to measure generalization without weights.

train function sequences these steps across epochs, adjusts learning rates, saves the best model.


According to my experience working with a BERT model for detecting toxic comments and LSTM model for generating poetry in the PyTorch framework, the BERT assignment looks easier. <br>
The pre trained BERT comes with its own tokenizer, so I skip most of the data preprocessing that LSTM needs. Cleaning the raw text, removing weird characters is pretty much for both, but the LSTM needs extra such as building sequance. With BERT, once the text is clean. I just call BertTokenizer; pad/truncate, and fine tune.

**I changed some parameters's values and I have tried to retrain the model, but I dont have credits to use GPU.**