In [None]:
pip install numpy pandas tensorflow keras


In [None]:
import numpy as np
import tensorflow as tf
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, LSTM, Embedding, Dense
from sklearn.model_selection import train_test_split
import os

# Load data
def load_data(file_path):
    """Read a tab-separated lexicon file into parallel source/target sequences.

    Every line with at least two tab-separated columns yields one pair:
    the second column becomes the source text, wrapped in a leading tab and a
    trailing newline marker, and the first column becomes the target text,
    unchanged. Lines with fewer than two columns (including blank lines) are
    skipped; any columns after the second are ignored.

    Args:
        file_path: Path of a UTF-8 encoded TSV file.

    Returns:
        A pair ``(sources, targets)`` of equal-length tuples of strings, so the
        result can be unpacked as ``src, tgt = load_data(path)``. Both tuples
        are empty when the file contains no usable line (previously this case
        returned an empty iterator and the unpacking itself failed).
    """
    sources, targets = [], []
    with open(file_path, 'r', encoding='utf-8') as f:
        rows = f.read().strip().split('\n')
    for row in rows:
        columns = row.split('\t')
        if len(columns) < 2:
            continue  # malformed or blank line
        # Markers go on the source side; the consumer strips them with [1:-1] for display.
        sources.append('\t' + columns[1] + '\n')
        targets.append(columns[0])
    return tuple(sources), tuple(targets)

# Paths
data_path = "/content/drive/MyDrive/dakshina_dataset_v1.0/hi/lexicons/hi.translit.sampled.train.tsv"
input_texts, target_texts = load_data(data_path)

# The decoder is trained with teacher forcing, so each decoder sequence needs its own
# start ('\t') and end ('\n') markers: decode_sequence() seeds generation with
# target_token_index['\t'] and stops once '\n' is produced. Without them that lookup
# raises KeyError and the model never learns when to stop. target_texts itself is left
# unmarked so it can still be printed as the reference spelling.
decoder_texts = ['\t' + text + '\n' for text in target_texts]

# Create vocabulary
input_characters = sorted(set(''.join(input_texts)))
target_characters = sorted(set(''.join(decoder_texts)))

# Indices start at 1; index 0 is reserved for padding so the Embedding layers can mask it.
input_token_index = {char: i+1 for i, char in enumerate(input_characters)}  # +1 for masking
target_token_index = {char: i+1 for i, char in enumerate(target_characters)}  # +1 for masking
reverse_target_char_index = {i: char for char, i in target_token_index.items()}

max_encoder_seq_length = max(len(txt) for txt in input_texts)
max_decoder_seq_length = max(len(txt) for txt in decoder_texts)

# Vectorize data
# encoder/decoder inputs: (samples, time) integer ids, zero-padded on the right.
# decoder targets: (samples, time, vocab+1) one-hot, i.e. the decoder input shifted one step left.
num_samples = len(input_texts)
encoder_input_data = np.zeros((num_samples, max_encoder_seq_length), dtype='int32')
decoder_input_data = np.zeros((num_samples, max_decoder_seq_length), dtype='int32')
decoder_target_data = np.zeros((num_samples, max_decoder_seq_length, len(target_token_index)+1), dtype='float32')

for i, (input_text, decoder_text) in enumerate(zip(input_texts, decoder_texts)):
    for t, char in enumerate(input_text):
        encoder_input_data[i, t] = input_token_index.get(char, 0)
    for t, char in enumerate(decoder_text):
        decoder_input_data[i, t] = target_token_index.get(char, 0)
        if t > 0:
            # The target at step t-1 is the character the decoder reads at step t,
            # so the start marker is never a target and the end marker always is.
            decoder_target_data[i, t-1, target_token_index.get(char, 0)] = 1.0

# Split (fixed random_state so the train/validation partition is the same on every run)
enc_train, enc_val, dec_in_train, dec_in_val, dec_tar_train, dec_tar_val = train_test_split(
    encoder_input_data, decoder_input_data, decoder_target_data, test_size=0.1, random_state=42)

# Define model
embedding_dim = 128
latent_dim = 256
num_encoder_tokens = len(input_token_index)
num_decoder_tokens = len(target_token_index)

# Seed Python, NumPy and TensorFlow RNGs so weight initialisation and shuffling are repeatable.
tf.keras.utils.set_random_seed(42)

# Encoder: embeds the source characters and keeps only the final LSTM state (h, c).
encoder_inputs = Input(shape=(None,))
# Vocabulary size is +1 because index 0 is the padding id that mask_zero hides from the LSTM.
enc_emb = Embedding(num_encoder_tokens+1, embedding_dim, mask_zero=True)(encoder_inputs)
# NOTE: despite its name, `encoder_lstm` is the LSTM's output tensor, not the layer.
encoder_lstm, state_h, state_c = LSTM(latent_dim, return_state=True)(enc_emb)
encoder_states = [state_h, state_c]

# Decoder: starts from the encoder state and predicts the next target character at each step.
decoder_inputs = Input(shape=(None,))
# Keep a handle on the Embedding *layer* so the inference decoder below can reuse its weights;
# `dec_emb` is only the tensor produced by applying it and cannot be called again.
decoder_embedding = Embedding(num_decoder_tokens+1, embedding_dim, mask_zero=True)
dec_emb = decoder_embedding(decoder_inputs)
decoder_lstm = LSTM(latent_dim, return_sequences=True, return_state=True)
decoder_outputs, _, _ = decoder_lstm(dec_emb, initial_state=encoder_states)
decoder_dense = Dense(num_decoder_tokens+1, activation='softmax')
decoder_outputs = decoder_dense(decoder_outputs)

model = Model([encoder_inputs, decoder_inputs], decoder_outputs)
model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])

# Train
model.fit([enc_train, dec_in_train], dec_tar_train,
          batch_size=64,
          epochs=20,
          validation_data=([enc_val, dec_in_val], dec_tar_val))

# Inference Models
# Encoder model: source ids -> [h, c] used to initialise the decoder.
encoder_model = Model(encoder_inputs, encoder_states)

# Decoder model: (previous character id, h, c) -> (next-character distribution, new h, new c).
# It shares the trained embedding, LSTM and Dense layers with `model`.
decoder_state_input_h = Input(shape=(latent_dim,))
decoder_state_input_c = Input(shape=(latent_dim,))
dec_states_inputs = [decoder_state_input_h, decoder_state_input_c]
dec_emb2 = decoder_embedding(decoder_inputs)
dec_outputs2, state_h2, state_c2 = decoder_lstm(dec_emb2, initial_state=dec_states_inputs)
dec_states2 = [state_h2, state_c2]
dec_outputs2 = decoder_dense(dec_outputs2)
decoder_model = Model([decoder_inputs] + dec_states_inputs, [dec_outputs2] + dec_states2)

# Decode function
def decode_sequence(input_seq):
    """Greedily decode one encoded source sequence into a target string.

    The encoder state for ``input_seq`` initialises the decoder, which is then
    run one character at a time: each step feeds back the most probable
    character from the previous step. Decoding stops when the newline end
    marker is produced or after ``max_decoder_seq_length + 1`` steps.

    The step bound (rather than a bound on the decoded string's length)
    guarantees termination even if the model predicts the padding index or an
    index with no character, both of which contribute an empty string.

    Args:
        input_seq: Batch containing a single encoded source sequence, as
            produced by slicing ``encoder_input_data[i:i+1]``.

    Returns:
        The decoded text with surrounding whitespace (including the end
        marker) stripped.

    Raises:
        KeyError: If the tab start marker is missing from ``target_token_index``.
    """
    # verbose=0: predict() is called once per character, progress bars would flood the output.
    states_value = list(encoder_model.predict(input_seq, verbose=0))
    target_seq = np.zeros((1, 1))
    target_seq[0, 0] = target_token_index['\t']  # Start char

    decoded_chars = []
    for _ in range(max_decoder_seq_length + 1):
        output_tokens, h, c = decoder_model.predict([target_seq] + states_value, verbose=0)
        sampled_token_index = int(np.argmax(output_tokens[0, -1, :]))
        sampled_char = reverse_target_char_index.get(sampled_token_index, '')
        decoded_chars.append(sampled_char)

        if sampled_char == '\n':  # End marker reached
            break

        # Feed the prediction and the updated LSTM state into the next step.
        target_seq[0, 0] = sampled_token_index
        states_value = [h, c]

    return ''.join(decoded_chars).strip()

# Example test: transliterate the first five training words and print them beside the reference
for i in range(5):
    # Slice rather than index so the encoder still receives a batch of size one.
    input_seq = encoder_input_data[i:i+1]
    decoded = decode_sequence(input_seq)
    # [1:-1] removes the '\t' / '\n' markers that load_data wraps around each source word.
    source_word = input_texts[i][1:-1]
    print(f"Input: {source_word} | Predicted: {decoded} | True: {target_texts[i]}")


In [8]:
# Peek at the raw dev lexicon; the captured output shows three tab-separated fields per line
# (native spelling, romanization, and a number — presumably an attestation count, TODO confirm).
dev_path = "/content/drive/MyDrive/dakshina_dataset_v1.0/hi/lexicons/hi.translit.sampled.dev.tsv"
with open(dev_path, encoding='utf-8') as f:
    lines = f.read().strip().split('\n')
print(lines[:10])

['अंकन\tankan\t3', 'अंगकोर\tangkor\t3', 'अंगिरा\tangira\t3', 'अंगीठी\tangithi\t3', 'अंग्रेज\tangrej\t3', 'अंग्रेजों\tangrejon\t4', 'अंजाम\tanjaam\t2', 'अंजाम\tanjam\t1', 'अंतकरण\tantakaran\t1', 'अंतकरण\tantkaran\t2']


In [6]:
import numpy as np
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense
from tensorflow.keras.preprocessing.sequence import pad_sequences

# Sample lyrics corpus (replace this with your own dataset or text file)
lyrics = """
You are my fire
The one desire
Believe when I say
I want it that way
But we are two worlds apart
Can't reach to your heart
When you say that I want it that way
"""

# Seed Python, NumPy and TensorFlow RNGs so training and the sampling below are repeatable.
tf.keras.utils.set_random_seed(42)

# Preprocessing
tokenizer = Tokenizer()
tokenizer.fit_on_texts([lyrics])
# word_index starts at 1, so +1 leaves room for index 0, which pad_sequences uses as filler.
total_words = len(tokenizer.word_index) + 1

# Create input sequences: every prefix (length >= 2) of every line is one training example,
# whose last token is the label and whose earlier tokens are the context.
input_sequences = []
for line in lyrics.strip().split('\n'):
    token_list = tokenizer.texts_to_sequences([line])[0]
    for i in range(1, len(token_list)):
        n_gram = token_list[:i+1]
        input_sequences.append(n_gram)

# Pad sequences and create predictors and label
max_seq_len = max(len(seq) for seq in input_sequences)
# 'pre' padding keeps the label in the last column of every row.
input_sequences = pad_sequences(input_sequences, maxlen=max_seq_len, padding='pre')
X = input_sequences[:, :-1]
y = input_sequences[:, -1]
y = to_categorical(y, num_classes=total_words)

# Build model
model = Sequential()
# `input_length` is intentionally not passed: Keras 3 deprecates the argument and the
# layer takes the sequence length from the data it is called on.
model.add(Embedding(total_words, 64))
model.add(LSTM(100))
model.add(Dense(total_words, activation='softmax'))

model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
model.fit(X, y, epochs=200, verbose=0)  # Set verbose=1 if you want to see training progress

# Generate lyrics
def generate_lyrics(seed_text, next_words=10, temperature=1.0):
    """Extend ``seed_text`` by sampling words from the trained next-word model.

    For each new word the current text is tokenized, left-padded to the model's
    input length, and passed to the model. The predicted distribution is
    reshaped by ``temperature`` and one word is drawn from it. The padding
    index 0 is excluded from sampling because it has no word, so it can never
    append an empty token to the text.

    Args:
        seed_text: Starting text; generated words are appended to it.
        next_words: Number of words to generate.
        temperature: Sampling temperature. Values below 1 sharpen the
            distribution and values above 1 flatten it. A value <= 0 selects
            the most probable word deterministically instead of dividing by
            zero.

    Returns:
        ``seed_text`` followed by the generated words, separated by single spaces.
    """
    for _ in range(next_words):
        token_list = tokenizer.texts_to_sequences([seed_text])[0]
        token_list = pad_sequences([token_list], maxlen=max_seq_len-1, padding='pre')
        # float64 keeps the renormalised probabilities summing to 1 within np.random.choice's tolerance.
        probs = np.asarray(model.predict(token_list, verbose=0), dtype='float64').flatten()
        probs[0] = 0.0  # index 0 is the padding id, not a word

        if temperature <= 0:
            # Greedy decoding: the limit of temperature sampling as temperature -> 0.
            predicted_word_index = int(np.argmax(probs))
        else:
            logits = np.log(probs + 1e-8) / temperature
            logits[0] = -np.inf  # keep padding at exactly zero probability after the epsilon
            logits -= logits.max()  # shift for numerical stability; softmax is shift-invariant
            weights = np.exp(logits)
            predicted_word_index = int(np.random.choice(total_words, p=weights / weights.sum()))

        # index_word is the tokenizer's reverse mapping; avoids scanning word_index per word.
        output_word = tokenizer.index_word.get(predicted_word_index, '')
        seed_text += ' ' + output_word
    return seed_text


# Try generating: continue the seed phrase by three sampled words
sample_lyrics = generate_lyrics("I want", next_words=3, temperature=0.8)
print(sample_lyrics)



I want it that way
