In [None]:
# Step 1: Install required libraries
!pip install tensorflow numpy

# Step 2: Import libraries
import numpy as np
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense, Dropout, Bidirectional
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
import pandas as pd
import io
from google.colab import files

# Step 3: Enable TPU
# Pick the distribution strategy used later when the model is built.
# The whole TPU bring-up lives inside the `try`, so a ValueError raised by any
# of these calls triggers the fallback branch below.
try:
    tpu = tf.distribute.cluster_resolver.TPUClusterResolver()
    print('Running on TPU:', tpu.master())
    # Connect to the resolved cluster and initialise the TPU system before
    # wrapping it in a TPUStrategy.
    tf.config.experimental_connect_to_cluster(tpu)
    tf.tpu.experimental.initialize_tpu_system(tpu)
    strategy = tf.distribute.TPUStrategy(tpu)
except ValueError:
    # This script treats ValueError as "no TPU available" and falls back to
    # whatever strategy TensorFlow currently reports via get_strategy().
    # NOTE(review): other exception types are not caught here — confirm that
    # ValueError is the only failure mode on the target TensorFlow version.
    print('TPU not found. Using GPU/CPU.')
    strategy = tf.distribute.get_strategy()

# Step 4: Upload a CSV through the Colab file picker and load it with pandas
uploaded = files.upload()
uploaded_file_name = list(uploaded)[0]  # only the first uploaded file is used
print(f"Uploaded file: {uploaded_file_name}")

# The upload is held in memory as bytes, so wrap it in a file-like buffer
csv_buffer = io.BytesIO(uploaded[uploaded_file_name])
df = pd.read_csv(csv_buffer)

# The first column of the CSV is taken to be the poetry text
print("Column names in the dataset:", df.columns)
poetry_column_name = df.columns[0]
print(f"Using column '{poetry_column_name}' for poetry data.")

# Drop missing rows and coerce every remaining entry to a plain string
poetry_column = df[poetry_column_name]
poetry_data = poetry_column.dropna().astype(str).tolist()

# Step 5: Tokenization
# Fit the vocabulary on the whole corpus; the filter string strips
# punctuation, tabs and newlines before words are indexed.
tokenizer = Tokenizer(filters='!"#$%&()*+,-./:;<=>?@[\\]^_`{|}~\t\n')
tokenizer.fit_on_texts(poetry_data)
total_words = len(tokenizer.word_index) + 1

# Expand each encoded line into all of its prefixes of length >= 2, so that
# every prefix provides one (context, next word) training example.
input_sequences = [
    encoded_line[:prefix_len]
    for encoded_line in tokenizer.texts_to_sequences(poetry_data)
    for prefix_len in range(2, len(encoded_line) + 1)
]

# Left-pad (and clip) every prefix to a fixed length
max_sequence_len = 60
input_sequences = pad_sequences(
    input_sequences,
    maxlen=max_sequence_len,
    padding='pre',
)

# The final token of each row is the target; everything before it is the input
X, y = input_sequences[:, :-1], input_sequences[:, -1]

# One-hot encode the targets over the full vocabulary
y = tf.keras.utils.to_categorical(y, num_classes=total_words)

# Step 6: Build an improved LSTM model
# The model is created and compiled inside the strategy scope chosen earlier.
with strategy.scope():
    model = Sequential()
    # Word embeddings (300-d) over inputs of length max_sequence_len - 1
    model.add(Embedding(total_words, 300, input_length=max_sequence_len-1))
    # Bidirectional LSTM that emits the full sequence for the next recurrent layer
    model.add(Bidirectional(LSTM(512, return_sequences=True)))
    model.add(Dropout(0.3))
    # Second LSTM collapses the sequence to a single vector
    model.add(LSTM(256))
    # Softmax over the vocabulary predicts the next word
    model.add(Dense(total_words, activation='softmax'))

    model.compile(
        optimizer='adam',
        loss='categorical_crossentropy',
        metrics=['accuracy'],
    )

# Step 7: Train the model
from tensorflow.keras.callbacks import EarlyStopping, ReduceLROnPlateau

# Both callbacks watch the training loss (no validation data is passed to fit).
early_stopping = EarlyStopping(
    monitor='loss',
    patience=5,
    restore_best_weights=True,
)
reduce_lr = ReduceLROnPlateau(
    monitor='loss',
    factor=0.2,
    patience=3,
    min_lr=1e-6,
)
training_callbacks = [early_stopping, reduce_lr]

model.fit(
    X,
    y,
    epochs=100,
    batch_size=512,
    callbacks=training_callbacks,
    verbose=1,
)

# Step 8: Implement Beam Search for Poetry Generation
def beam_search_predictions(seed_text, next_words, max_sequence_len, beam_width=3):
    """Extend ``seed_text`` by up to ``next_words`` words using beam search.

    Relies on the module-level ``tokenizer`` and ``model``.

    Args:
        seed_text: Starting text; it is lower-cased before use.
        next_words: Number of words to append.
        max_sequence_len: Sequence length used during training; model inputs
            are left-padded/clipped to ``max_sequence_len - 1`` tokens.
        beam_width: Number of hypotheses kept after every step, and number of
            continuations tried per hypothesis. Must be at least 1.

    Returns:
        The text of the highest-scoring hypothesis.

    Raises:
        ValueError: If ``beam_width`` is smaller than 1.
    """
    if beam_width < 1:
        # A width of 0 would turn the `[-beam_width:]` slice below into
        # `[-0:]`, i.e. the entire vocabulary.
        raise ValueError("beam_width must be at least 1")

    seed_text = seed_text.lower()
    # Hypotheses are scored by summed log-probabilities: this ranks them the
    # same way as a product of probabilities but does not underflow.
    beams = [(seed_text, 0.0)]

    for _ in range(next_words):
        # Encode and score every live hypothesis with one predict() call.
        encoded = pad_sequences(
            tokenizer.texts_to_sequences([text for text, _score in beams]),
            maxlen=max_sequence_len - 1,
            padding='pre',
        )
        batch_preds = model.predict(encoded, verbose=0)

        all_candidates = []
        for (text, score), preds in zip(beams, batch_preds):
            # Clip before the log so zero probabilities do not yield -inf/NaN.
            log_probs = np.log(np.clip(preds, 1e-12, None))
            # Index 0 is the padding slot and has no word attached; never
            # propose it as a continuation.
            log_probs[0] = -np.inf
            top_indices = np.argsort(log_probs)[-beam_width:]

            for word_index in top_indices:
                word = tokenizer.index_word.get(int(word_index))
                if not word:
                    # No vocabulary entry for this index: skip instead of
                    # appending an empty "word".
                    continue
                all_candidates.append(
                    (f"{text} {word}", score + float(log_probs[word_index]))
                )

        if not all_candidates:
            # Nothing could be extended; keep the hypotheses found so far.
            break

        # Keep only the best `beam_width` hypotheses for the next step.
        beams = sorted(all_candidates, key=lambda cand: cand[1], reverse=True)[:beam_width]

    return beams[0][0]

# Step 9: Generate a full ghazal
def generate_ghazal(seed_text, num_couplets, max_sequence_len, beam_width=3, max_retries=5):
    """Generate a ghazal made of ``num_couplets`` two-line couplets.

    Each line is produced by ``beam_search_predictions`` (10 words appended to
    the current seed). The last word of every generated line becomes the seed
    for the next line.

    Args:
        seed_text: Seed for the first line.
        num_couplets: Number of couplets to generate.
        max_sequence_len: Forwarded to ``beam_search_predictions``.
        beam_width: Beam width used for the first attempt at every line.
        max_retries: Maximum number of extra attempts made when a generated
            line repeats an earlier one. If every retry still yields a
            duplicate, the duplicate is kept so generation always terminates.

    Returns:
        The couplets joined by blank lines; the two lines of a couplet are
        separated by a single newline.
    """
    ghazal = []
    used_lines = set()

    for _ in range(num_couplets):
        couplet = []
        for _ in range(2):  # Two-line couplet
            line = beam_search_predictions(
                seed_text,
                next_words=10,
                max_sequence_len=max_sequence_len,
                beam_width=beam_width,
            )

            # Beam search is deterministic, so repeating the call with the
            # same arguments would return the same line forever. Widen the
            # beam on each retry to explore other hypotheses, and cap the
            # number of attempts so this can never hang.
            retry_width = beam_width
            for _attempt in range(max_retries):
                if line not in used_lines:
                    break
                retry_width += 1
                line = beam_search_predictions(
                    seed_text,
                    next_words=10,
                    max_sequence_len=max_sequence_len,
                    beam_width=retry_width,
                )

            used_lines.add(line)
            couplet.append(line)

            # Use the last word as the next seed; keep the current seed when
            # the line contains no words (avoids IndexError on empty output).
            words = line.split()
            if words:
                seed_text = words[-1]
        ghazal.append("\n".join(couplet))

    return "\n\n".join(ghazal)

# Step 10: Ask for a seed word and a couplet count, then generate the ghazal
seed_text = input("Enter a seed word (e.g., 'dil'): ")
num_couplets = int(input("Enter the number of couplets to generate: "))

generated_ghazal = generate_ghazal(
    seed_text=seed_text,
    num_couplets=num_couplets,
    max_sequence_len=max_sequence_len,
)

# Heading, a blank line, then the poem itself
print("\nGenerated Ghazal:\n", generated_ghazal, sep="\n")

# Step 11: Write the trained model to disk
model.save("shairi_generator_v2.h5")
print("\nModel saved as 'shairi_generator_v2.h5'")


Note: you may need to restart the kernel to use updated packages.
TPU not found. Using GPU/CPU.



[notice] A new release of pip available: 22.3.1 -> 25.0
[notice] To update, run: python.exe -m pip install --upgrade pip


FileUpload(value=(), accept='.csv', description='Upload')