**Word-Level Tokenization Model**

In [None]:
import tensorflow as tf
import numpy as np
import pandas as pd
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

# Read the poetry corpus from disk
file_path = "dataset.csv"  # alternative file name: "roman-urdu-poetry.csv"
data = pd.read_csv(file_path)

# Merge every non-missing poem into one string, separating poems with a
# newline so the line breaks inside each poem are kept as-is.
poems = data["Poetry"].dropna()
text = "\n".join(poems.tolist())
print(f"Dataset length: {len(text)} characters")

Dataset length: 700115 characters


In [None]:
# === Word-level tokenization with character filtering disabled === #
# filters='' turns off the tokenizer's character filtering, so punctuation
# and newline characters are not stripped from the text.
# NOTE(review): the default word split is presumably still a single space, in
# which case a newline stays attached to its neighbouring word unless the
# corpus has spaces around it -- confirm against the fitted vocabulary.
tokenizer = Tokenizer(filters='')
tokenizer.fit_on_texts([text])

# One extra slot on top of the fitted words: id 0 is used for padding.
vocab_size = 1 + len(tokenizer.word_index)
print(f"Vocabulary Size: {vocab_size}")

# Encode the whole corpus as one flat list of word ids
encoded_corpus = tokenizer.texts_to_sequences([text])
text_sequences = encoded_corpus[0]

Vocabulary Size: 16224


In [None]:
def _longest_verse_words(poem):
    """Return the word count of the longest newline-separated verse in `poem`."""
    # str.split() with no arguments already ignores leading/trailing whitespace.
    words_per_verse = (len(verse.split()) for verse in poem.split('\n'))
    return max(words_per_verse, default=0)


# Longest verse (in words) across all non-missing poems
max_verse_length = data['Poetry'].dropna().apply(_longest_verse_words).max()

print("Maximum verse length (in words):", max_verse_length)

Maximum verse length (in words): 23


In [None]:
# === Create Input-Target Sequences === #
# Each training example is a window of `seq_length` consecutive word ids;
# its target is the single word id that immediately follows the window.
seq_length = max_verse_length
target_positions = range(seq_length, len(text_sequences))

inputs = np.array(
    [text_sequences[pos - seq_length:pos] for pos in target_positions]
)
targets = np.array([text_sequences[pos] for pos in target_positions])

In [None]:
# === Build the Model === #
# Next-word classifier: Embedding -> single LSTM -> softmax over the vocabulary.
# `input_length` is deliberately not passed to Embedding: the argument is
# deprecated in Keras 3 (it only triggers a warning) and the input shape is
# already supplied by model.build() below.
model = tf.keras.Sequential([
    tf.keras.layers.Embedding(input_dim=vocab_size, output_dim=256),
    tf.keras.layers.LSTM(1024, return_sequences=False),
    tf.keras.layers.Dense(vocab_size, activation='softmax')
])
model.build(input_shape=(None, seq_length))

# Targets are integer word ids (not one-hot vectors), hence the sparse loss.
model.compile(loss='sparse_categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
model.summary()



In [None]:
# === Train Model === #
BATCH_SIZE = 256
EPOCHS = 15

# Fit on the sliding-window inputs/targets; the return value of fit() is kept
# in `history`.
history = model.fit(
    x=inputs,
    y=targets,
    batch_size=BATCH_SIZE,
    epochs=EPOCHS,
)

Epoch 1/15
[1m498/498[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m42s[0m 78ms/step - accuracy: 0.0364 - loss: 7.5656
Epoch 2/15
[1m498/498[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m41s[0m 83ms/step - accuracy: 0.0523 - loss: 6.8335
Epoch 3/15
[1m498/498[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m83s[0m 84ms/step - accuracy: 0.0828 - loss: 6.4464
Epoch 4/15
[1m498/498[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m82s[0m 84ms/step - accuracy: 0.1155 - loss: 5.9376
Epoch 5/15
[1m498/498[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m82s[0m 84ms/step - accuracy: 0.1524 - loss: 5.2577
Epoch 6/15
[1m498/498[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m82s[0m 84ms/step - accuracy: 0.2166 - loss: 4.3843
Epoch 7/15
[1m498/498[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m82s[0m 84ms/step - accuracy: 0.3595 - loss: 3.3943
Epoch 8/15
[1m498/498[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m82s[0m 84ms/step - accuracy: 0.5128 - loss: 2.5254
Epoch 9/15
[1m498/498[

In [None]:
# Write the trained model to disk under an .h5 file name
model_save_path = "poetry_model1.h5"
model.save(filepath=model_save_path)



In [None]:
# Write the same model under a .keras file name (this is the file the Gradio
# cell loads later on).
model_save_path1 = "roman_urdu_poetry_model.keras"
model.save(filepath=model_save_path1)

In [None]:
# === Text Generation with Newline Support ===
def generate_text(model, tokenizer, seed_text, num_generate=50, seq_length=10, temperature=1.0, sample=False):
    """Extend `seed_text` word by word using a next-word prediction model.

    Args:
        model: Object whose ``predict(batch, verbose=0)`` maps a
            ``(1, seq_length)`` array of word ids to a ``(1, vocab)`` array of
            next-word probabilities.
        tokenizer: Fitted tokenizer exposing ``texts_to_sequences`` and
            ``index_word``.
        seed_text: Text the generation starts from. Words unknown to the
            tokenizer contribute nothing to the model context.
        num_generate: Maximum number of prediction steps.
        seq_length: Number of most recent word ids fed to the model; shorter
            contexts are left-padded with 0. Pass the window length the model
            was trained with.
        temperature: Softmax temperature, used only when ``sample=True``.
            Greedy decoding ignores it, because rescaling the probabilities
            by a constant never changes which one is largest.
        sample: If False (default), always pick the most probable word. If
            True, draw the next word at random from the temperature-scaled
            distribution.

    Returns:
        The seed text followed by the generated words.

    Raises:
        ValueError: If ``sample`` is True and ``temperature`` is not positive.
    """
    if sample and temperature <= 0:
        raise ValueError("temperature must be greater than 0 when sample=True")

    generated_text = seed_text
    # Keep a running list of word ids instead of re-tokenizing the growing
    # text on every step: that is cheaper and keeps the context exact even
    # when a generated token is appended to the text without a leading space.
    token_ids = list(tokenizer.texts_to_sequences([seed_text])[0])
    rng = np.random.default_rng() if sample else None

    for _ in range(num_generate):
        # Left-pad the most recent `seq_length` ids with zeros.
        recent_ids = token_ids[-seq_length:]
        context = np.zeros((1, seq_length), dtype="int32")
        if recent_ids:
            context[0, -len(recent_ids):] = recent_ids

        # Predict the next word
        probabilities = np.asarray(model.predict(context, verbose=0)[0], dtype="float64")
        if sample:
            # Temperature acts on log-probabilities; log(0) = -inf is fine
            # here because exp(-inf) turns it back into a zero weight.
            with np.errstate(divide="ignore"):
                scaled_logits = np.log(probabilities) / temperature
            weights = np.exp(scaled_logits - scaled_logits.max())
            predicted_id = int(rng.choice(weights.size, p=weights / weights.sum()))
        else:
            predicted_id = int(np.argmax(probabilities))

        # Id 0 is the padding value, not a word.
        if predicted_id == 0:
            if sample:
                continue  # a fresh draw on the next step may yield a real word
            break  # greedy decoding would repeat this prediction every step

        predicted_word = tokenizer.index_word[predicted_id]
        token_ids.append(predicted_id)

        # A standalone newline token is appended as-is; any other token is
        # separated from the previous one by a space.
        if predicted_word == "\n":
            generated_text += "\n"
        else:
            generated_text += " " + predicted_word

    return generated_text

# Generate poetry with new lines.
# The context length is passed explicitly: the model was trained on windows of
# `seq_length` words, whereas generate_text() falls back to 10 when the
# argument is omitted.
seed_text = "ishq"
generated_poetry = generate_text(
    model,
    tokenizer,
    seed_text=seed_text,
    num_generate=50,
    seq_length=seq_length,
)
print("\nGenerated Poetry with New Lines:\n", generated_poetry)



Generated Poetry with New Lines:
 ishq ke saath 
tū ne kuchh is se kī sūrat nahīñ 
is taraf ab tak koī nahīñ hai jahāñ meñ 
jo dil kī tarah se mitā jaane hai 
yahī jo thā us ne ki mar jaa.e to achchhā kyā ho 
ye bhī sach hai ki ulfat meñ pareshāñ na ho 
ye


In [None]:
import gradio as gr
import tensorflow as tf
import numpy as np
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
import pandas as pd

# Rebuild the corpus string from the dataset
file_path = "dataset.csv"
data = pd.read_csv(file_path)
poems = data["Poetry"].dropna()
text = "\n".join(poems.tolist())

# Refit the word-level tokenizer (character filtering disabled) on the corpus
tokenizer = Tokenizer(filters='')
tokenizer.fit_on_texts([text])
vocab_size = 1 + len(tokenizer.word_index)

# Load the trained model from disk
model_save_path = "roman_urdu_poetry_model.keras"
model = tf.keras.models.load_model(model_save_path)


def _longest_verse_words(poem):
    """Return the word count of the longest newline-separated verse in `poem`."""
    # str.split() with no arguments already ignores leading/trailing whitespace.
    words_per_verse = (len(verse.split()) for verse in poem.split('\n'))
    return max(words_per_verse, default=0)


# The model context length is the longest verse (in words) found in the data
max_verse_length = data['Poetry'].dropna().apply(_longest_verse_words).max()
seq_length = max_verse_length

# Text generation function
def generate_text(seed_text, num_generate=50, temperature=1.0, sample=True):
    """Generate poetry for the Gradio UI from the module-level model.

    Uses the module-level ``model``, ``tokenizer`` and ``seq_length``.

    Args:
        seed_text: Text the generation starts from; ``None`` is treated as an
            empty string.
        num_generate: Maximum number of prediction steps. Cast to ``int``
            because UI sliders may deliver a float.
        temperature: Softmax temperature applied when ``sample`` is True.
            Lower values stay closer to the most probable word, higher values
            give more varied output.
        sample: If True (default), draw each word at random from the
            temperature-scaled distribution so the temperature control has an
            effect. If False, always take the most probable word, in which
            case ``temperature`` is irrelevant.

    Returns:
        The seed text followed by the generated words.

    Raises:
        ValueError: If ``sample`` is True and ``temperature`` is not positive.
    """
    num_generate = int(num_generate)
    temperature = float(temperature)
    if sample and temperature <= 0:
        raise ValueError("temperature must be greater than 0 when sample=True")

    generated_text = seed_text or ""
    # Keep a running list of word ids instead of re-tokenizing the growing
    # text on every step: that is cheaper and keeps the context exact even
    # when a generated token is appended to the text without a leading space.
    token_ids = list(tokenizer.texts_to_sequences([generated_text])[0])
    rng = np.random.default_rng() if sample else None

    for _ in range(num_generate):
        # Left-pad the most recent `seq_length` ids with zeros.
        recent_ids = token_ids[-seq_length:]
        context = np.zeros((1, seq_length), dtype="int32")
        if recent_ids:
            context[0, -len(recent_ids):] = recent_ids

        probabilities = np.asarray(model.predict(context, verbose=0)[0], dtype="float64")
        if sample:
            # Temperature acts on log-probabilities; log(0) = -inf is fine
            # here because exp(-inf) turns it back into a zero weight.
            with np.errstate(divide="ignore"):
                scaled_logits = np.log(probabilities) / temperature
            weights = np.exp(scaled_logits - scaled_logits.max())
            predicted_id = int(rng.choice(weights.size, p=weights / weights.sum()))
        else:
            predicted_id = int(np.argmax(probabilities))

        predicted_word = tokenizer.index_word.get(predicted_id, '')
        # Id 0 is the padding value; ids without a word are skipped as well.
        if predicted_id == 0 or not predicted_word:
            if sample:
                continue  # a fresh draw on the next step may yield a real word
            break  # greedy decoding would repeat this prediction every step

        token_ids.append(predicted_id)
        if predicted_word == "\n":
            generated_text += "\n"
        else:
            generated_text += " " + predicted_word
    return generated_text

# Create Gradio interface: one input widget per generate_text argument
seed_box = gr.Textbox(label="Seed Text", placeholder="Enter a seed word for poetry")
length_slider = gr.Slider(minimum=10, maximum=200, step=10, value=50, label="Number of Words")
temperature_slider = gr.Slider(minimum=0.1, maximum=2.0, step=0.1, value=1.0, label="Temperature")
poetry_box = gr.Textbox(label="Generated Poetry")

iface = gr.Interface(
    fn=generate_text,
    inputs=[seed_box, length_slider, temperature_slider],
    outputs=poetry_box,
    title="Roman Urdu Poetry Generator",
    description="Enter a seed word and generate poetry in Roman Urdu with newline support."
)

# Start the web UI
iface.launch()


  saveable.load_own_variables(weights_store.get(inner_path))


Running Gradio in a Colab notebook requires sharing enabled. Automatically setting `share=True` (you can turn this off by setting `share=False` in `launch()` explicitly).

Colab notebook detected. To show errors in colab notebook, set debug=True in launch()
* Running on public URL: https://44c56461f117026651.gradio.live

This share link expires in 72 hours. For free permanent hosting and GPU upgrades, run `gradio deploy` from the terminal in the working directory to deploy to Hugging Face Spaces (https://huggingface.co/spaces)




In [None]:
import tensorflow as tf

# === Build the Model === #
# Same next-word classifier as before, with the LSTM wrapped in Bidirectional.
# `input_length` is deliberately not passed to Embedding: the argument is
# deprecated in Keras 3 (it only triggers a warning) and the input shape is
# already supplied by model2.build() below.
model2 = tf.keras.Sequential([
    tf.keras.layers.Embedding(input_dim=vocab_size, output_dim=256),
    tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(1024, return_sequences=False)),
    tf.keras.layers.Dense(vocab_size, activation='softmax')
])

model2.build(input_shape=(None, seq_length))
# Targets are integer word ids (not one-hot vectors), hence the sparse loss.
model2.compile(loss='sparse_categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
model2.summary()




In [None]:
# === Train Model === #
BATCH_SIZE = 256
EPOCHS = 15

# Fit the bidirectional model on the same sliding-window inputs/targets; the
# return value of fit() is kept in `history`.
history = model2.fit(
    x=inputs,
    y=targets,
    batch_size=BATCH_SIZE,
    epochs=EPOCHS,
)

Epoch 1/15
[1m498/498[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m84s[0m 161ms/step - accuracy: 0.0353 - loss: 7.5623
Epoch 2/15
[1m498/498[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m81s[0m 161ms/step - accuracy: 0.0542 - loss: 6.7861
Epoch 3/15
[1m498/498[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m82s[0m 162ms/step - accuracy: 0.0863 - loss: 6.2287
Epoch 4/15
[1m498/498[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m82s[0m 162ms/step - accuracy: 0.1314 - loss: 5.2956
Epoch 5/15
[1m498/498[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m82s[0m 162ms/step - accuracy: 0.2831 - loss: 3.8525
Epoch 6/15
[1m498/498[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m82s[0m 162ms/step - accuracy: 0.5044 - loss: 2.3986
Epoch 7/15
[1m498/498[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m82s[0m 162ms/step - accuracy: 0.7210 - loss: 1.3281
Epoch 8/15
[1m498/498[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m82s[0m 162ms/step - accuracy: 0.8946 - loss: 0.5943
Epoch 9/15
[1m4

In [None]:
# Write the bidirectional model to disk
model_save_path2 = "roman_urdu_poetry_model_lstm.keras"
model2.save(filepath=model_save_path2)

In [None]:
# === Text Generation with Newline Support ===
def generate_text(model, tokenizer, seed_text, num_generate=50, seq_length=10, temperature=1.0, sample=False):
    """Extend `seed_text` word by word using a next-word prediction model.

    Args:
        model: Object whose ``predict(batch, verbose=0)`` maps a
            ``(1, seq_length)`` array of word ids to a ``(1, vocab)`` array of
            next-word probabilities.
        tokenizer: Fitted tokenizer exposing ``texts_to_sequences`` and
            ``index_word``.
        seed_text: Text the generation starts from. Words unknown to the
            tokenizer contribute nothing to the model context.
        num_generate: Maximum number of prediction steps.
        seq_length: Number of most recent word ids fed to the model; shorter
            contexts are left-padded with 0. Pass the window length the model
            was trained with.
        temperature: Softmax temperature, used only when ``sample=True``.
            Greedy decoding ignores it, because rescaling the probabilities
            by a constant never changes which one is largest.
        sample: If False (default), always pick the most probable word. If
            True, draw the next word at random from the temperature-scaled
            distribution.

    Returns:
        The seed text followed by the generated words.

    Raises:
        ValueError: If ``sample`` is True and ``temperature`` is not positive.
    """
    if sample and temperature <= 0:
        raise ValueError("temperature must be greater than 0 when sample=True")

    generated_text = seed_text
    # Keep a running list of word ids instead of re-tokenizing the growing
    # text on every step: that is cheaper and keeps the context exact even
    # when a generated token is appended to the text without a leading space.
    token_ids = list(tokenizer.texts_to_sequences([seed_text])[0])
    rng = np.random.default_rng() if sample else None

    for _ in range(num_generate):
        # Left-pad the most recent `seq_length` ids with zeros.
        recent_ids = token_ids[-seq_length:]
        context = np.zeros((1, seq_length), dtype="int32")
        if recent_ids:
            context[0, -len(recent_ids):] = recent_ids

        # Predict the next word
        probabilities = np.asarray(model.predict(context, verbose=0)[0], dtype="float64")
        if sample:
            # Temperature acts on log-probabilities; log(0) = -inf is fine
            # here because exp(-inf) turns it back into a zero weight.
            with np.errstate(divide="ignore"):
                scaled_logits = np.log(probabilities) / temperature
            weights = np.exp(scaled_logits - scaled_logits.max())
            predicted_id = int(rng.choice(weights.size, p=weights / weights.sum()))
        else:
            predicted_id = int(np.argmax(probabilities))

        # Id 0 is the padding value, not a word.
        if predicted_id == 0:
            if sample:
                continue  # a fresh draw on the next step may yield a real word
            break  # greedy decoding would repeat this prediction every step

        predicted_word = tokenizer.index_word[predicted_id]
        token_ids.append(predicted_id)

        # A standalone newline token is appended as-is; any other token is
        # separated from the previous one by a space.
        if predicted_word == "\n":
            generated_text += "\n"
        else:
            generated_text += " " + predicted_word

    return generated_text

# Generate poetry with new lines.
# The context length is passed explicitly: model2 was trained on windows of
# `seq_length` words, whereas generate_text() falls back to 10 when the
# argument is omitted.
seed_text = "ishq main"
generated_poetry = generate_text(
    model2,
    tokenizer,
    seed_text=seed_text,
    num_generate=50,
    seq_length=seq_length,
)
print("\nGenerated Poetry with New Lines:\n", generated_poetry)



Generated Poetry with New Lines:
 ishq main meñ aur chāñd phuul kī ishq meñ 
raat ko kyā hai jahāñ ik dariyā se hai tan se jo jaanā hai 
ishq kī taraf se shīshe meñ jo hai na dil meñ hai mohtasib us ne jo raaz kiyā hai jo raaz jo chhoḍ kar tujhe jaane vaale bhī ham
