In [None]:
import os
import sys
from tempfile import NamedTemporaryFile
from urllib.request import urlopen
from urllib.parse import unquote, urlparse
from urllib.error import HTTPError
from zipfile import ZipFile
import tarfile
import shutil

# Number of bytes requested per read() call while streaming a download.
CHUNK_SIZE = 40960
# Comma-separated list of "<directory>:<url-encoded download URL>" entries.
# The URL below is a signed link: its query string carries
# X-Goog-Date=20240322T082153Z and X-Goog-Expires=259200, so it presumably
# stops working after that window (the download loop reports HTTP errors as
# "likely expired").
# NOTE(review): a signed URL is a credential of sorts — avoid committing fresh ones.
DATA_SOURCE_MAPPING = 'poemsdataset:https%3A%2F%2Fstorage.googleapis.com%2Fkaggle-data-sets%2F974990%2F1648795%2Fbundle%2Farchive.zip%3FX-Goog-Algorithm%3DGOOG4-RSA-SHA256%26X-Goog-Credential%3Dgcp-kaggle-com%2540kaggle-161607.iam.gserviceaccount.com%252F20240322%252Fauto%252Fstorage%252Fgoog4_request%26X-Goog-Date%3D20240322T082153Z%26X-Goog-Expires%3D259200%26X-Goog-SignedHeaders%3Dhost%26X-Goog-Signature%3D46cf03a9cf31cbd96f47b33847f5b4913d2052d6df07553a22e10c88d185cbd291f94e4cc1d90cbc2c49b211c8790f31a440d7f7d555e6f540caabbe3a1ed2449cf946c5d7172f98e94d6472e5d810d7f1fdc26c8874dc819cfddd5a6d728205fa26a4d10796715ce98ac2698357921319f98fbeabfd315d244e092302b3326f883171fec2d17b1554e9c9d5aeb348f0c57e4dc08071c3130dd3ea9a905cd033ce9d632ca4b2f71d3479e614dd6756e70f6b8e54476ed6109a37a33db5da47fbb61cf8e67e4461deeae03f6028eeb591f9f8d023cb6aa628068b08f8d49cecdfc9b785f5e157919ca7d67c34b144a2ff8117818c8cdd61a74686a164e34e9efa'

KAGGLE_INPUT_PATH='/kaggle/input'
KAGGLE_WORKING_PATH='/kaggle/working'
KAGGLE_SYMLINK='kaggle'

!umount /kaggle/input/ 2> /dev/null
shutil.rmtree('/kaggle/input', ignore_errors=True)
os.makedirs(KAGGLE_INPUT_PATH, 0o777, exist_ok=True)
os.makedirs(KAGGLE_WORKING_PATH, 0o777, exist_ok=True)

try:
  os.symlink(KAGGLE_INPUT_PATH, os.path.join("..", 'input'), target_is_directory=True)
except FileExistsError:
  pass
try:
  os.symlink(KAGGLE_WORKING_PATH, os.path.join("..", 'working'), target_is_directory=True)
except FileExistsError:
  pass

# Download every data source listed in DATA_SOURCE_MAPPING and unpack it under
# KAGGLE_INPUT_PATH/<directory>. Each comma-separated entry has the form
# "<directory>:<url-encoded download URL>".
# NOTE(review): archives are extracted without validating member paths; only
# use this with download sources you trust.
for data_source_mapping in DATA_SOURCE_MAPPING.split(','):
    # Split on the first ':' only so a literal ':' in the URL part cannot
    # break the two-value unpacking.
    directory, download_url_encoded = data_source_mapping.split(':', 1)
    download_url = unquote(download_url_encoded)
    filename = urlparse(download_url).path
    destination_path = os.path.join(KAGGLE_INPUT_PATH, directory)
    try:
        with urlopen(download_url) as fileres, NamedTemporaryFile() as tfile:
            total_length = fileres.headers['content-length']
            # The header may be missing (or malformed); in that case the
            # progress bar simply stays empty instead of crashing the download.
            try:
                total_bytes = int(total_length)
            except (TypeError, ValueError):
                total_bytes = 0
            print(f'Downloading {directory}, {total_length} bytes compressed')
            dl = 0
            data = fileres.read(CHUNK_SIZE)
            while len(data) > 0:
                dl += len(data)
                tfile.write(data)
                # 50-character progress bar, clamped in case more bytes arrive
                # than the header announced.
                done = min(50, int(50 * dl / total_bytes)) if total_bytes > 0 else 0
                sys.stdout.write(f"\r[{'=' * done}{' ' * (50-done)}] {dl} bytes downloaded")
                sys.stdout.flush()
                data = fileres.read(CHUNK_SIZE)
            # Push buffered bytes to disk: the tar branch re-opens the file by
            # name and would otherwise see a truncated archive.
            tfile.flush()
            if filename.endswith('.zip'):
                with ZipFile(tfile) as zfile:
                    zfile.extractall(destination_path)
            else:
                # Use a distinct name so the `tarfile` module is not rebound
                # to an archive object.
                with tarfile.open(tfile.name) as tar_archive:
                    tar_archive.extractall(destination_path)
            print(f'\nDownloaded and uncompressed: {directory}')
    except HTTPError:
        # Must precede OSError: HTTPError is a subclass of it.
        print(f'Failed to load (likely expired) {download_url} to path {destination_path}')
        continue
    except OSError:
        print(f'Failed to load {download_url} to path {destination_path}')
        continue

print('Data source import complete.')


Downloading poemsdataset, 16279790 bytes compressed
Data source import complete.


In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import random
import os

import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.layers import *

In [None]:
root_dir = "/kaggle/input/poemsdataset/topics"
corpus_size = 10000


def load_corpus(root_dir, max_lines, encoding="utf-8"):
    """Collect up to ``max_lines`` text lines from every file below ``root_dir``.

    Directories are visited with ``os.walk`` and each visited directory is
    announced with a ``Loading <dir>`` message. Files are read whole and split
    on ``"\\n"``; blank lines are kept and count towards the limit.

    Args:
        root_dir: Directory tree to read.
        max_lines: Maximum number of lines to return; values <= 0 yield ``[]``.
        encoding: Text encoding used to decode the files. Passed explicitly so
            the result does not depend on the platform's default encoding.

    Returns:
        A list of at most ``max_lines`` strings, in traversal order.

    Raises:
        UnicodeDecodeError: If a file cannot be decoded with ``encoding``.
    """
    lines = []
    if max_lines <= 0:
        return lines
    # NOTE(review): os.walk order follows the directory listing and is not
    # sorted, so which poems make the cut may differ between machines.
    for dirname, _, filenames in os.walk(root_dir):
        print(f"Loading {dirname}")
        for filename in filenames:
            with open(os.path.join(dirname, filename), "r", encoding=encoding) as file:
                for line in file.read().split("\n"):
                    lines.append(line)
                    # Stop everything as soon as the quota is reached.
                    if len(lines) >= max_lines:
                        return lines
    return lines


print("Loading poems in corpus...\n")
corpus = load_corpus(root_dir, corpus_size)

Loading poems in corpus...

Loading /kaggle/input/poemsdataset/topics
Loading /kaggle/input/poemsdataset/topics/frog
Loading /kaggle/input/poemsdataset/topics/money
Loading /kaggle/input/poemsdataset/topics/paris
Loading /kaggle/input/poemsdataset/topics/city


In [None]:
# Sanity check: number of lines collected (capped at corpus_size).
len(corpus)

10000

In [None]:
# Peek at the first ten lines exactly as they were read from disk.
corpus[:10]

['I have a pet frog',
 'We go frog walking every day at 6pm',
 'He is fine until he sees another frog',
 'He sniffs & then tries to jump',
 'I pull him away',
 'Last week we had a problem',
 'My frog stopped to do his business',
 'A passer bye said stop your frog fouling',
 'I cleared up after him but it left a slimy mark',
 'The other man slipped on it']

In [None]:
import string

# Translation table that deletes every character in string.punctuation;
# built once instead of on every call.
_PUNCTUATION_TABLE = str.maketrans(dict.fromkeys(string.punctuation))


def remove_punc(s):
    """Return ``s`` with all ``string.punctuation`` characters removed.

    Characters are deleted, not replaced, so whitespace around a removed
    symbol is left as is (``"a & b"`` becomes ``"a  b"``).
    """
    return s.translate(_PUNCTUATION_TABLE)

# Normalise each line: lower-case it, trim surrounding whitespace, then strip punctuation.
corpus = [remove_punc(line.lower().strip()) for line in corpus]

In [None]:
# Same ten lines after lower-casing and punctuation removal.
corpus[:10]

['i have a pet frog',
 'we go frog walking every day at 6pm',
 'he is fine until he sees another frog',
 'he sniffs  then tries to jump',
 'i pull him away',
 'last week we had a problem',
 'my frog stopped to do his business',
 'a passer bye said stop your frog fouling',
 'i cleared up after him but it left a slimy mark',
 'the other man slipped on it']

In [None]:
# Build the word -> integer index vocabulary from the cleaned corpus.
# NOTE(review): no oov_token is configured, so words that are not in this
# corpus are presumably dropped when seed text is tokenized later — confirm.
tokenizer = Tokenizer()
tokenizer.fit_on_texts(corpus)

In [None]:
# Number of distinct words plus one; used below as the embedding input size
# and as the width of the softmax output layer.
# NOTE(review): the +1 presumably accounts for index 0, which padding uses and
# which is assumed not to be assigned to any word — confirm.
vocab_size = len(tokenizer.word_index) + 1
print(f"Vocabulary size: {vocab_size}")

Vocabulary size: 11747


In [None]:
n_grams = []

for sentence in corpus:
    # convert sentence to tokens
    token_ids = tokenizer.texts_to_sequences([sentence])[0]
    # every prefix of at least two tokens becomes one training sequence
    n_grams.extend(token_ids[:end] for end in range(2, len(token_ids) + 1))

# length of the longest prefix (0 when no n-gram was produced)
max_sequence_len = max(map(len, n_grams), default=0)

print(f"Number of n-grams: {len(n_grams)}")
print(f"Maximum n-gram length: {max_sequence_len}")

Number of n-grams: 55930
Maximum n-gram length: 166


In [None]:
# Show the first ten n-grams: each one extends the previous prefix of the
# same line by one token.
for n_gram in n_grams[:10]:
    print(n_gram)

[8, 38]
[8, 38, 5]
[8, 38, 5, 2903]
[8, 38, 5, 2903, 30]
[26, 91]
[26, 91, 30]
[26, 91, 30, 580]
[26, 91, 30, 580, 99]
[26, 91, 30, 580, 99, 76]
[26, 91, 30, 580, 99, 76, 32]


## Pad n-grams

In [None]:
# Bring every n-gram to a fixed length of 100 tokens: shorter ones are padded
# on the left, longer ones lose their oldest tokens (truncating="pre"), so the
# final token — used as the prediction target later — is always kept.
# NOTE(review): 100 is hard-coded even though max_sequence_len is computed
# above; the model's input_length=99 and the generation code's maxlen=99 must
# stay equal to this value minus one.
padded_n_grams = np.array(pad_sequences(n_grams, maxlen=100, padding="pre", truncating="pre"))

padded_n_grams.shape

(55930, 100)

In [None]:
# Inspect the first three padded sequences (real tokens sit at the right end).
for seq in padded_n_grams[:3]:
    print(seq)

[ 0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0
  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0
  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0
  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0
  0  0  8 38]
[ 0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0
  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0
  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0
  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0
  0  8 38  5]
[   0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0   

In [None]:
# Next-word prediction setup: the first 99 columns of each padded n-gram are
# the model input, the last column is the word to predict.
X = padded_n_grams[:, :-1]
y = padded_n_grams[:, -1]

print(f"X: {X.shape}")
print(f"y: {y.shape}")

X: (55930, 99)
y: (55930,)


In [None]:
# one hot encode y
#
# The integer labels are taken from the last column of `padded_n_grams`
# rather than from `y` itself, so re-running this cell cannot one-hot encode
# an already encoded matrix.
# NOTE(review): this materialises a dense (n_samples, vocab_size) array —
# (55930, 11747) in the recorded run. Integer labels with
# "sparse_categorical_crossentropy" would avoid it, but that also requires
# changing the loss passed to model.compile.
y = tf.keras.utils.to_categorical(padded_n_grams[:, -1], num_classes=vocab_size)

print(f"y: {y.shape}")

y: (55930, 11747)


In [None]:
from tensorflow.keras import regularizers
# Next-word classifier: embedding -> bidirectional LSTM -> LSTM -> dense
# bottleneck -> softmax over the whole vocabulary.
model = tf.keras.Sequential([
    # 99 input tokens per sample (padded n-gram minus its last token),
    # each mapped to a 300-dimensional vector.
    Embedding(vocab_size, 300, input_length=99),
    Bidirectional(LSTM(150, return_sequences=True)),
    Dropout(0.2),
    LSTM(100),
    # The unit count must be an integer: `vocab_size / 2` is a float, which
    # only worked where Keras truncated it silently. Integer division yields
    # the same layer width (5873 in the recorded summary).
    Dense(vocab_size // 2, activation='relu', kernel_regularizer=regularizers.l2(0.001)),
    # One probability per vocabulary index.
    Dense(vocab_size, activation='softmax'),
])
# categorical_crossentropy matches the one-hot encoded targets.
model.compile(loss="categorical_crossentropy", optimizer="adam", metrics=["accuracy"])
model.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, 99, 300)           3524100   
                                                                 
 bidirectional (Bidirection  (None, 99, 300)           541200    
 al)                                                             
                                                                 
 dropout (Dropout)           (None, 99, 300)           0         
                                                                 
 lstm_1 (LSTM)               (None, 100)               160400    
                                                                 
 dense (Dense)               (None, 5873)              593173    
                                                                 
 dense_1 (Dense)             (None, 11747)             69001878  
                                                        

In [None]:
# Stop once the training loss has not improved for 20 consecutive epochs.
# No validation data is passed, so the training loss is the monitored quantity.
early_stopping = tf.keras.callbacks.EarlyStopping(monitor="loss", patience=20)

model.fit(X, y, epochs=150, batch_size=128, callbacks=[early_stopping])

Epoch 1/150
Epoch 2/150
Epoch 3/150
Epoch 4/150
Epoch 5/150
Epoch 6/150
Epoch 7/150
Epoch 8/150
Epoch 9/150
Epoch 10/150
Epoch 11/150
Epoch 12/150
Epoch 13/150
Epoch 14/150
Epoch 15/150
Epoch 16/150
Epoch 17/150
Epoch 18/150
Epoch 19/150
Epoch 20/150
Epoch 21/150
Epoch 22/150
Epoch 23/150
Epoch 24/150
Epoch 25/150
Epoch 26/150
Epoch 27/150
Epoch 28/150
Epoch 29/150
Epoch 30/150
Epoch 31/150
Epoch 32/150
Epoch 33/150
Epoch 34/150
Epoch 35/150

In [None]:
from keras.preprocessing.sequence import pad_sequences
from keras.models import load_model
import numpy as np

# Persist the trained model to 'model.h5' in the current working directory so
# it can be restored later with load_model.
model.save('model.h5')

In [None]:
import matplotlib.pyplot as plt

# Plot learning curves

In [None]:
# Per-epoch metrics recorded by the last model.fit call.
hist = model.history.history

# Two stacked panels: training loss on top, training accuracy below.
fig, (loss_ax, accuracy_ax) = plt.subplots(2, 1, figsize=(10, 8))

loss_ax.set_title("Loss")
loss_ax.plot(hist["loss"])
loss_ax.set_ylabel("Loss")
loss_ax.grid(True)

accuracy_ax.set_title("Accuracy")
accuracy_ax.plot(hist["accuracy"], color="orange")
accuracy_ax.set_xlabel("Epoch")
accuracy_ax.set_ylabel("Accuracy")
accuracy_ax.grid(True)

plt.show()

# Poetry generation

In [None]:
def generate(seed_text, next_words):
    """Greedily extend ``seed_text`` by ``next_words`` words and print the result.

    On every step the whole text generated so far is tokenized with the
    module-level ``tokenizer``, left-padded to 99 tokens (the input length the
    module-level ``model`` was built with) and fed to the model; the index
    with the highest predicted score is appended as the next word.

    Args:
        seed_text: Text the generation starts from.
        next_words: Number of words to append.

    Returns:
        None. The final text is written to stdout.
    """
    # Invert the vocabulary once so each predicted index is resolved with a
    # dict lookup instead of a scan over the whole vocabulary per word.
    index_to_word = {index: word for word, index in tokenizer.word_index.items()}
    for _ in range(next_words):
        token_list = tokenizer.texts_to_sequences([seed_text])[0]
        token_list = pad_sequences([token_list], maxlen=99, padding="pre")
        predicted = int(np.argmax(model.predict(token_list, verbose=0)))
        # An index without a vocabulary entry contributes an empty word,
        # exactly as the original scan did.
        seed_text += " " + index_to_word.get(predicted, "")
    print(seed_text)

In [None]:
# Example: continue the single-word seed "Nature" with 50 generated words.
generate("Nature", 50)

In [None]:
def generate_poem(genre, title, next_words=100):
    """Print a poem seeded with ``genre`` and ``title``.

    The seed text is ``genre + " " + title``; generation itself is delegated
    to ``generate`` so the next-word loop exists in one place only (the
    previous copy also compared vocabulary indices against a one-element
    array instead of a scalar).

    Args:
        genre: Genre word(s) placed at the start of the seed text.
        title: Title appended to the genre, separated by a single space.
        next_words: Number of words to generate (default 100).

    Returns:
        None. The poem is written to stdout by ``generate``.
    """
    # Combine genre and title for the seed text
    generate(genre + " " + title, next_words)

# Example usage:
# Reload the model that was saved as 'model.h5'. This rebinds the global
# `model` used during generation, replacing the instance already in memory.
model = load_model('model.h5')  # Adjust the path as needed

# Generate 100 words seeded with the given genre and title
generate_poem("Fairy-tales", "Tales from the Enchanted Forest", 100)
