# **Dataset Preparation**

In [1]:
import pandas as pd
import sentencepiece as spm
import torch
from torch.nn.utils.rnn import pad_sequence

In [10]:
df = pd.read_csv('./data/diffused_sonnets_2.csv')

df['Variation Text'] = df['Variation Text'].str.replace('\n', ' <LINE> ')

print(df.shape)

df.head()

(460, 3)


Unnamed: 0,Sonnet Number,Variation Number,Variation Text
0,1,1,"Desire for growth in loveliest of beings, <LIN..."
1,1,2,"In the fairest of beings, we crave increase, <..."
2,1,3,"For fairest beings, we yearn for more to grace..."
3,2,1,As forty winters carve their icy lines on your...
4,2,2,When winter's hand has etched its stories on y...


# **1. Simple Tokenization**

In [7]:
# 1. Prepare the training data for SentencePiece: join all sonnets with newlines
combined_text = '\n'.join(df['Variation Text'].tolist())

# 2. Write combined text to a temporary file (SentencePiece expects a file input for training)
with open('./data/sonnets_train.txt', 'w', encoding='utf-8') as f:
    f.write(combined_text)

# 3. Train SentencePiece tokenizer from the file
spm.SentencePieceTrainer.train(
    input='./data/sonnets_train.txt',
    model_prefix='./tokenizer/my_tokenizer',
    vocab_size=4000, 
    user_defined_symbols=['<LINE>', '<PAD>']
)

sentencepiece_trainer.cc(78) LOG(INFO) Starts training with : 
trainer_spec {
  input: ./data/sonnets_train.txt
  input_format: 
  model_prefix: ./tokenizer/my_tokenizer
  model_type: UNIGRAM
  vocab_size: 4000
  self_test_sample_size: 0
  character_coverage: 0.9995
  input_sentence_size: 0
  shuffle_input_sentence: 1
  seed_sentencepiece_size: 1000000
  shrinking_factor: 0.75
  max_sentence_length: 4192
  num_threads: 16
  num_sub_iterations: 2
  max_sentencepiece_length: 16
  split_by_unicode_script: 1
  split_by_number: 1
  split_by_whitespace: 1
  split_digits: 0
  pretokenization_delimiter: 
  treat_whitespace_as_suffix: 0
  allow_whitespace_only_pieces: 0
  user_defined_symbols: <LINE>
  user_defined_symbols: <PAD>
  required_chars: 
  byte_fallback: 0
  vocabulary_output_piece_score: 1
  train_extremely_large_corpus: 0
  seed_sentencepieces_file: 
  hard_vocab_limit: 1
  use_all_vocab: 0
  unk_id: 0
  bos_id: 1
  eos_id: 2
  pad_id: -1
  unk_piece: <unk>
  bos_piece: <s>
  eos_p

In [3]:
# 4. Load the trained tokenizer
sp = spm.SentencePieceProcessor()
sp.load('./tokenizer/my_tokenizer.model')

# 5. Tokenize each sonnet in the DataFrame
def tokenize_sonnet(text):
    return sp.encode_as_ids(text)

df['tokenized_sonnet'] = df['Variation Text'].apply(tokenize_sonnet)

# Example: print tokenized pieces for the first sonnet
print(df.loc[0, 'tokenized_sonnet'])

[1989, 30, 3528, 12, 2144, 104, 15, 1489, 6, 5, 3, 45, 239, 49, 9, 7, 236, 50, 1091, 20, 201, 6, 5, 3, 101, 21, 3718, 1284, 381, 6, 244, 76, 76, 2732, 6, 5, 3, 25, 639, 15, 2003, 16, 3482, 8, 5, 3, 61, 18, 6, 699, 16, 13, 60, 130, 145, 6, 5, 3, 64, 3015, 13, 92, 9, 7, 257, 35, 143, 79, 1434, 1892, 6, 5, 3, 2540, 20, 863, 12, 21, 610, 15, 953, 9, 7, 1998, 6, 5, 3, 105, 1992, 2635, 13, 60, 143, 6, 262, 182, 246, 8, 5, 3, 146, 6, 80, 10, 227, 9, 7, 216, 517, 328, 2139, 290, 6, 5, 3, 19, 31, 3033, 20, 83, 15, 451, 12, 62, 43, 504, 6, 5, 3, 24, 13, 60, 1084, 6, 214, 13, 152, 6, 5, 3, 19, 1705, 54, 977, 6, 158, 707, 974, 7, 3432, 8, 5, 3, 474, 67, 1095, 51, 10, 119, 6, 71, 46, 1894, 706, 757, 608, 6, 5, 3, 229, 230, 612, 87, 9, 7, 5, 2963, 6, 32, 417, 17, 99, 8, 5, 3, 5, 3]


In [4]:
df.loc[0, "tokenized_sonnet"].count(sp.encode('<LINE>')[0])
df.loc[0, "Variation Text"].count('\n')

# Get the average amount of  slash n of all Variation Text
avg_newlines = df['Variation Text'].str.count('<LINE>').mean()

print(f"Average number of newlines in Variation Text: {avg_newlines}")

Average number of newlines in Variation Text: 14.826086956521738


In [5]:
def pad_tokenized_sonnets(tokenized_sonnets, pad_id=-1, max_len=256):

    # Truncate each token list to max_len, then convert to tensor
    token_tensors = [torch.tensor(tokens[:max_len]) for tokens in tokenized_sonnets]
    
    # Pad sequences to the max length in the batch (≤ max_len)
    padded_tensor = pad_sequence(token_tensors, batch_first=True, padding_value=pad_id)
    
    # If padding length is less than max_len, pad extra manually
    if padded_tensor.size(1) < max_len:
        pad_size = max_len - padded_tensor.size(1)
        pad_tensor = torch.zeros((padded_tensor.size(0), pad_size), dtype=padded_tensor.dtype)
        padded_tensor = torch.cat([padded_tensor, pad_tensor], dim=1)
    
    return padded_tensor

# Pad the tokenized sonnets
padded_sonnets = pad_tokenized_sonnets(df['tokenized_sonnet'].tolist(), pad_id=sp.piece_to_id('<PAD>'))

# Example: print the shape of the padded tensor
print(padded_sonnets.shape)  # Should be (batch_size, max_seq_len)

torch.Size([460, 256])


In [6]:
# Prepare input X and target y:
# X: tokenized sonnets padded to (batch_size, 256)
X = padded_sonnets

# y: X shifted left by 1 (next token prediction), pad last token with 0 or ignore_index
y = torch.zeros_like(X)
y[:, :-1] = X[:, 1:]
y[:, -1] = sp.pad_id()  # or any padding token id to ignore last token prediction

In [7]:
X, y

(tensor([[1989,   30, 3528,  ...,    0,    0,    0],
         [  24,   10,  210,  ...,    0,    0,    0],
         [  42,  210,  104,  ...,    0,    0,    0],
         ...,
         [  24,  335,  753,  ...,    0,    0,    0],
         [  24,  186,    5,  ...,    0,    0,    0],
         [  24, 2270,  240,  ...,    0,    0,    0]]),
 tensor([[  30, 3528,   12,  ...,    0,    0,   -1],
         [  10,  210,  104,  ...,    0,    0,   -1],
         [ 210,  104, 1489,  ...,    0,    0,   -1],
         ...,
         [ 335,  753,    6,  ...,    0,    0,   -1],
         [ 186,    5,  491,  ...,    0,    0,   -1],
         [2270,  240,    6,  ...,    0,    0,   -1]]))

In [12]:
# Save X and y tensors to a file
torch.save((X, y), './data/sonnets_data.pt')

# **Rhyme Tags**

In [13]:
# Define rhyme scheme tokens
rhyme_classes = ['A', 'B', 'C', 'D', 'E', 'F', 'G']

def add_rhyme_tokens(sonnet_text, rhyme_scheme="ABABCDCDEFEFGG", target_lines=14):
    """
    Insert rhyme tokens at the end of each line of the sonnet based on rhyme scheme.
    Pads or truncates sonnets to `target_lines` lines.
    
    sonnet_text: str, full sonnet text with lines separated by <LINE> tokens
    rhyme_scheme: str, e.g. Shakespeare’s 14-line sonnet scheme
    target_lines: int, number of lines to pad/truncate to (default 14)
    
    Returns: sonnet text with rhyme tokens inserted at line ends, padded/truncated to target_lines
    """
    # Split sonnet into lines
    lines = sonnet_text.split('<LINE>')
    lines = [line.strip() for line in lines if line.strip()]  # Remove empty lines
    
    # Pad with empty lines if too short
    if len(lines) < target_lines:
        lines += [''] * (target_lines - len(lines))
    # Truncate if too long
    elif len(lines) > target_lines:
        lines = lines[:target_lines]
    
    # Adjust rhyme scheme length to target_lines
    rhyme_scheme = rhyme_scheme[:target_lines]
    
    # Append rhyme token at end of each line
    lines_with_rhyme = [
        line + f" <rhyme_{rhyme_scheme[i]}>" for i, line in enumerate(lines)
    ]
    
    # Re-join with <LINE>
    return " <LINE> ".join(lines_with_rhyme)

In [14]:
df["rhyme_sonnet"] = df["Variation Text"].apply(add_rhyme_tokens)
df.head()

Unnamed: 0,Sonnet Number,Variation Number,Variation Text,tokenized_sonnet,rhyme_sonnet
0,1,1,"Desire for growth in loveliest of beings, <LIN...","[1989, 30, 3528, 12, 2144, 104, 15, 1489, 6, 5...","Desire for growth in loveliest of beings, <rhy..."
1,1,2,"In the fairest of beings, we crave increase, <...","[24, 10, 210, 104, 15, 1489, 6, 112, 1018, 750...","In the fairest of beings, we crave increase, <..."
2,1,3,"For fairest beings, we yearn for more to grace...","[42, 210, 104, 1489, 6, 112, 763, 30, 109, 16,...","For fairest beings, we yearn for more to grace..."
3,2,1,As forty winters carve their icy lines on your...,"[101, 30, 820, 2751, 1242, 47, 5, 3647, 228, 5...",As forty winters carve their icy lines on your...
4,2,2,When winter's hand has etched its stories on y...,"[91, 276, 9, 7, 163, 144, 5, 1271, 43, 1378, 5...",When winter's hand has etched its stories on y...


In [15]:
with open('./data/sonnets_rhymes_train.txt', 'w', encoding='utf-8') as f:
    for sonnet in df['rhyme_sonnet']:
        f.write(sonnet + '\n')


In [16]:
rhyme_tokens = [f"<rhyme_{c}>" for c in rhyme_classes]

user_defined_symbols = ['<LINE>', "<PAD>"] + rhyme_tokens

spm.SentencePieceTrainer.train(
    input='./data/sonnets_rhymes_train.txt',
    model_prefix='./tokenizer/my_rhyme_tokenizer',
    vocab_size=4000,
    user_defined_symbols=user_defined_symbols,
)

sentencepiece_trainer.cc(78) LOG(INFO) Starts training with : 
trainer_spec {
  input: ./data/sonnets_rhymes_train.txt
  input_format: 
  model_prefix: ./tokenizer/my_rhyme_tokenizer
  model_type: UNIGRAM
  vocab_size: 4000
  self_test_sample_size: 0
  character_coverage: 0.9995
  input_sentence_size: 0
  shuffle_input_sentence: 1
  seed_sentencepiece_size: 1000000
  shrinking_factor: 0.75
  max_sentence_length: 4192
  num_threads: 16
  num_sub_iterations: 2
  max_sentencepiece_length: 16
  split_by_unicode_script: 1
  split_by_number: 1
  split_by_whitespace: 1
  split_digits: 0
  pretokenization_delimiter: 
  treat_whitespace_as_suffix: 0
  allow_whitespace_only_pieces: 0
  user_defined_symbols: <LINE>
  user_defined_symbols: <PAD>
  user_defined_symbols: <rhyme_A>
  user_defined_symbols: <rhyme_B>
  user_defined_symbols: <rhyme_C>
  user_defined_symbols: <rhyme_D>
  user_defined_symbols: <rhyme_E>
  user_defined_symbols: <rhyme_F>
  user_defined_symbols: <rhyme_G>
  required_chars: 

In [17]:
# 4. Load the trained tokenizer
sp = spm.SentencePieceProcessor()
sp.load('./tokenizer/my_rhyme_tokenizer.model')

# 5. Tokenize each sonnet in the DataFrame
def tokenize_sonnet(text):
    return sp.encode_as_ids(text)

df['tokenized_rhyme_sonnet'] = df['rhyme_sonnet'].apply(tokenize_sonnet)

In [18]:
max_len = df['tokenized_rhyme_sonnet'].apply(len).max()

# Pad the tokenized sonnets
padded_sonnets = pad_tokenized_sonnets(df['tokenized_rhyme_sonnet'].tolist(), pad_id=sp.piece_to_id('<PAD>'), max_len=max_len)

# Example: print the shape of the padded tensor
print(padded_sonnets.shape)  # Should be (batch_size, max_seq_len)

torch.Size([460, 277])


In [19]:
X = padded_sonnets  # torch.LongTensor

# y: X shifted left by 1 (next token prediction), pad last token with 0 or ignore_index
y = torch.zeros_like(X)
y[:, :-1] = X[:, 1:]
y[:, -1] = sp.pad_id()  # or any padding token id to ignore last token prediction

In [20]:
X, y

(tensor([[1993,   37, 3529,  ...,    4,    4,    4],
         [  31,   17,  214,  ...,    4,    4,    4],
         [  49,  214,  113,  ...,    4,    4,    4],
         ...,
         [  31,  340,  717,  ...,    4,    4,    4],
         [  31,  194,   12,  ...,    4,    4,    4],
         [  31, 2269,  243,  ...,    4,    4,    4]]),
 tensor([[  37, 3529,   19,  ...,    4,    4,   -1],
         [  17,  214,  113,  ...,    4,    4,   -1],
         [ 214,  113, 1485,  ...,    4,    4,   -1],
         ...,
         [ 340,  717,   13,  ...,    4,    4,   -1],
         [ 194,   12,  495,  ...,    4,    4,   -1],
         [2269,  243,   13,  ...,    4,    4,   -1]]))

In [None]:
print(X.shape)

In [21]:
# Save X and y tensors to a file
torch.save((X, y), './data/sonnets_rhymes_data.pt')