In [1]:
import pandas as pd
import numpy as np
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, GRU, Dense
from sklearn.model_selection import train_test_split

# Read the poetry CSV into a DataFrame
df = pd.read_csv('/content/cleaned_filtered_poetry.csv')

# Peek at the first rows and the column labels before any cleaning
print(df.head())
print(df.columns)

# Drop rows containing missing values and renumber the index in one chained step
df = df.dropna().reset_index(drop=True)

# The text is taken from the first column (adjust if the CSV layout changes)
poetry_column = df.columns[0]
poetry = list(df[poetry_column].astype(str))

# Tokenization
tokenizer = Tokenizer()
tokenizer.fit_on_texts(poetry)
# Vocabulary size for the embedding/softmax: one more than the number of
# distinct words so that the padding value 0 has its own slot.
total_words = len(tokenizer.word_index) + 1

# Create input sequences: every prefix of length >= 2 of each tokenised line
# becomes one training sample (all but the last token are the input, the last
# token is the word to predict).
input_sequences = [
    token_list[:end]
    for token_list in tokenizer.texts_to_sequences(poetry)
    for end in range(2, len(token_list) + 1)
]
if not input_sequences:
    raise ValueError(
        "No line in the dataset contains at least two tokens; "
        "there is nothing to train on."
    )

# Pad sequences
# Size the window from the data instead of always padding to 100 columns:
# short lines would otherwise be mostly padding. 100 stays as an upper bound
# so a single very long line cannot blow up the input width.
max_sequence_len = min(100, max(len(sequence) for sequence in input_sequences))
input_sequences = pad_sequences(input_sequences, maxlen=max_sequence_len, padding='pre')

# Split input and output: the final column is the target word id
X, y = input_sequences[:, :-1], input_sequences[:, -1]
# One-hot targets, as expected by the categorical cross-entropy loss
y = tf.keras.utils.to_categorical(y, num_classes=total_words)

# Build GRU Model
# The input width is declared with an explicit Input layer rather than
# Embedding's `input_length` argument, which Keras 3 reports as deprecated.
# Declaring the shape up front keeps the model built so summary() can show
# output shapes and parameter counts before training.
model = Sequential([
    tf.keras.Input(shape=(max_sequence_len - 1,)),
    Embedding(total_words, 100),
    GRU(150, return_sequences=True),
    GRU(100),
    Dense(100, activation='relu'),
    Dense(total_words, activation='softmax'),
])

model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
model.summary()

batch_size = 32

# epochs=100 is only an upper bound: stop once validation loss has not improved
# for several epochs and roll back to the best weights, instead of continuing
# to fit the training set while validation loss climbs.
early_stopping = tf.keras.callbacks.EarlyStopping(
    monitor='val_loss',
    patience=5,
    restore_best_weights=True,
)

# Train the model
history = model.fit(
    X,
    y,
    epochs=100,
    verbose=1,
    batch_size=batch_size,
    validation_split=0.1,
    callbacks=[early_stopping],
)

# Function to generate poetry
def generate_poetry(seed_text, next_words=20):
    """Extend ``seed_text`` by greedily predicting one word at a time.

    Uses the module-level ``tokenizer``, ``model`` and ``max_sequence_len``.

    Args:
        seed_text: Starting phrase; each predicted word is appended to it.
        next_words: Maximum number of words to append. Coerced to ``int`` so
            float-valued counts (e.g. from UI widgets) are accepted.

    Returns:
        The seed text followed by the generated words, separated by single
        spaces. Generation stops early if the model predicts an index that has
        no word (such as the padding index 0), so no blank tokens are appended.
    """
    next_words = int(next_words)
    # Reverse vocabulary built by the tokenizer: O(1) lookup instead of
    # scanning word_index for every generated word.
    index_to_word = tokenizer.index_word

    for _ in range(next_words):
        token_list = tokenizer.texts_to_sequences([seed_text])[0]
        token_list = pad_sequences([token_list], maxlen=max_sequence_len-1, padding='pre')
        # predict() returns one row per input; take that row and reduce it to
        # a plain Python int rather than comparing against a 1-element array.
        probabilities = model.predict(token_list, verbose=0)[0]
        predicted = int(np.argmax(probabilities))

        output_word = index_to_word.get(predicted)
        if not output_word:
            # No word for this index: the context would not change, so every
            # further step would repeat the same prediction. Stop here.
            break
        seed_text += " " + output_word
    return seed_text

# Demo: extend a short seed phrase by ten predicted words and show the result
seed_text = "mohabbat ik"
print(
    generate_poetry(seed_text=seed_text, next_words=10)
)


                                               shers
0  aaj  ik  aur  baras  biit  gayā  us  ke  baġha...
1  mujhe  duniyā  ke  ta.anoñ  par  kabhī  ġhussa...
2  mujhe  duniya  ke  tanon  par  kabhi  ghussa  ...
3  aaj  ik  aur  baras  biit  gayā  us  ke  baġha...
4  miir  kyā  saade  haiñ  bīmār  hue  jis  ke  s...
Index(['shers'], dtype='object')




Epoch 1/100
[1m1045/1045[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m21s[0m 14ms/step - accuracy: 0.0401 - loss: 7.1344 - val_accuracy: 0.0318 - val_loss: 6.9761
Epoch 2/100
[1m1045/1045[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m14s[0m 13ms/step - accuracy: 0.0385 - loss: 6.7230 - val_accuracy: 0.0318 - val_loss: 7.1343
Epoch 3/100
[1m1045/1045[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m14s[0m 13ms/step - accuracy: 0.0393 - loss: 6.6743 - val_accuracy: 0.0450 - val_loss: 6.9345
Epoch 4/100
[1m1045/1045[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m21s[0m 14ms/step - accuracy: 0.0577 - loss: 6.4246 - val_accuracy: 0.0584 - val_loss: 6.9168
Epoch 5/100
[1m1045/1045[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m14s[0m 14ms/step - accuracy: 0.0683 - loss: 6.2811 - val_accuracy: 0.0636 - val_loss: 6.9743
Epoch 6/100
[1m1045/1045[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m20s[0m 13ms/step - accuracy: 0.0777 - loss: 6.0860 - val_accuracy: 0.0638 - val_loss: 7.034

In [2]:
# Persist the trained network to disk
model.save('/content/roman_urdu_poetry_model.h5')

# The network only maps word ids to word ids, so the saved file cannot be used
# for generation in a new session without the fitted vocabulary. Store the
# tokenizer next to the model so both can be restored together.
with open('/content/roman_urdu_poetry_tokenizer.json', 'w', encoding='utf-8') as tokenizer_file:
    tokenizer_file.write(tokenizer.to_json())




In [3]:
from tensorflow.keras.models import load_model

# Load the trained model from the same path it was written to by model.save().
# (The previous path, '/conteroman_urdu_poetry_model.h5', was a typo and does
# not point at the saved file.)
model = load_model('/content/roman_urdu_poetry_model.h5')




In [5]:
import gradio as gr

# Function to generate poetry
def generate_poetry(seed_text, next_words=20):
    """Extend ``seed_text`` by greedily predicting one word at a time.

    Uses the module-level ``tokenizer``, ``model`` and ``max_sequence_len``.

    Args:
        seed_text: Starting phrase; each predicted word is appended to it.
        next_words: Maximum number of words to append. Coerced to ``int``
            because the value arrives from a slider widget and may not be a
            plain integer.

    Returns:
        The seed text followed by the generated words, separated by single
        spaces. Generation stops early if the model predicts an index that has
        no word (such as the padding index 0), so no blank tokens are appended.
    """
    next_words = int(next_words)
    # Reverse vocabulary built by the tokenizer: O(1) lookup instead of
    # scanning word_index for every generated word.
    index_to_word = tokenizer.index_word

    for _ in range(next_words):
        token_list = tokenizer.texts_to_sequences([seed_text])[0]
        token_list = pad_sequences([token_list], maxlen=max_sequence_len-1, padding='pre')
        # predict() returns one row per input; take that row and reduce it to
        # a plain Python int rather than comparing against a 1-element array.
        probabilities = model.predict(token_list, verbose=0)[0]
        predicted = int(np.argmax(probabilities))

        output_word = index_to_word.get(predicted)
        if not output_word:
            # No word for this index: the context would not change, so every
            # further step would repeat the same prediction. Stop here.
            break
        seed_text += " " + output_word
    return seed_text

# Create Gradio Interface
# Input widgets: the seed phrase and how many words to generate
seed_input = gr.Textbox(label="Enter Seed Text")
length_input = gr.Slider(minimum=10, maximum=30, step=1, label="Select Poetry Length")

# Output widget showing the generated text
poem_output = gr.Textbox(label="Generated Poetry")

demo = gr.Interface(
    fn=generate_poetry,
    inputs=[seed_input, length_input],
    outputs=poem_output,
    title="Roman Urdu Poetry Generator",
    description="Enter a seed phrase, select the length of the poetry, and generate a poem in Roman Urdu.",
)

# share=True requests a temporary public URL in addition to the local server
demo.launch(share=True)


Colab notebook detected. To show errors in colab notebook, set debug=True in launch()
* Running on public URL: https://056566ac8da03440ba.gradio.live

This share link expires in 72 hours. For free permanent hosting and GPU upgrades, run `gradio deploy` from the terminal in the working directory to deploy to Hugging Face Spaces (https://huggingface.co/spaces)




In [1]:
%pip install streamlit


Collecting streamlit
  Using cached streamlit-1.42.0-py2.py3-none-any.whl.metadata (8.9 kB)
Collecting altair<6,>=4.0 (from streamlit)
  Using cached altair-5.5.0-py3-none-any.whl.metadata (11 kB)
Collecting blinker<2,>=1.0.0 (from streamlit)
  Using cached blinker-1.9.0-py3-none-any.whl.metadata (1.6 kB)
Collecting cachetools<6,>=4.0 (from streamlit)
  Using cached cachetools-5.5.1-py3-none-any.whl.metadata (5.4 kB)
Collecting click<9,>=7.0 (from streamlit)
  Using cached click-8.1.8-py3-none-any.whl.metadata (2.3 kB)
Collecting pillow<12,>=7.1.0 (from streamlit)
  Using cached pillow-11.1.0-cp312-cp312-manylinux_2_28_x86_64.whl.metadata (9.1 kB)
Collecting pyarrow>=7.0 (from streamlit)
  Using cached pyarrow-19.0.0-cp312-cp312-manylinux_2_28_x86_64.whl.metadata (3.3 kB)
Collecting tenacity<10,>=8.1.0 (from streamlit)
  Using cached tenacity-9.0.0-py3-none-any.whl.metadata (1.2 kB)
Collecting toml<2,>=0.10.1 (from streamlit)
  Using cached toml-0.10.2-py2.py3-none-any.whl.metadata (7.

In [2]:
from tensorflow.keras.preprocessing.text import Tokenizer

# Scratch check: construct a fresh tokenizer with default settings (not fitted)
to = Tokenizer()

In [4]:
# Scratch check: build another default tokenizer and print its repr
token = Tokenizer()
print(token)

<keras.src.legacy.preprocessing.text.Tokenizer object at 0x72cf2cb08230>
