In [2]:
import numpy as np
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, LSTM, Embedding, Dense

# Sample dataset (Replace this with your dataset)
english_sentences = ["hello", "how are you", "good morning", "thank you"]
japanese_sentences = ["こんにちは", "お元気ですか", "おはよう", "ありがとう"]

# Tokenize English sentences
tokenizer_eng = Tokenizer()
tokenizer_eng.fit_on_texts(english_sentences)
eng_vocab_size = len(tokenizer_eng.word_index) + 1
eng_sequences = tokenizer_eng.texts_to_sequences(english_sentences)
eng_max_len = max(len(seq) for seq in eng_sequences)
eng_padded = pad_sequences(eng_sequences, maxlen=eng_max_len, padding='post')

# Tokenize Japanese sentences
tokenizer_jap = Tokenizer()
tokenizer_jap.fit_on_texts(japanese_sentences)
jap_vocab_size = len(tokenizer_jap.word_index) + 1
jap_sequences = tokenizer_jap.texts_to_sequences(japanese_sentences)
jap_max_len = max(len(seq) for seq in jap_sequences)
jap_padded = pad_sequences(jap_sequences, maxlen=jap_max_len, padding='post')

# Encoder
encoder_inputs = Input(shape=(eng_max_len,))
enc_emb = Embedding(input_dim=eng_vocab_size, output_dim=128, mask_zero=True)(encoder_inputs)
encoder_lstm = LSTM(256, return_state=True)
encoder_outputs, state_h, state_c = encoder_lstm(enc_emb)
encoder_states = [state_h, state_c]

# Decoder
decoder_inputs = Input(shape=(jap_max_len,))
dec_emb = Embedding(input_dim=jap_vocab_size, output_dim=128, mask_zero=True)(decoder_inputs)
decoder_lstm = LSTM(256, return_sequences=True, return_state=True)
decoder_outputs, _, _ = decoder_lstm(dec_emb, initial_state=encoder_states)
decoder_dense = Dense(jap_vocab_size, activation='softmax')
decoder_outputs = decoder_dense(decoder_outputs)

# Define Model
model = Model([encoder_inputs, decoder_inputs], decoder_outputs)
model.compile(optimizer='adam', loss='sparse_categorical_crossentropy')

# Prepare target data for training
y_train = np.expand_dims(jap_padded, axis=-1)

# Train the model
model.fit([eng_padded, jap_padded], y_train, batch_size=32, epochs=100)

Epoch 1/100
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 3s/step - loss: 1.6150
Epoch 2/100
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 131ms/step - loss: 1.5921
Epoch 3/100
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 47ms/step - loss: 1.5690
Epoch 4/100
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 47ms/step - loss: 1.5454
Epoch 5/100
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 71ms/step - loss: 1.5208
Epoch 6/100
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 79ms/step - loss: 1.4946
Epoch 7/100
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 74ms/step - loss: 1.4664
Epoch 8/100
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 46ms/step - loss: 1.4356
Epoch 9/100
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 46ms/step - loss: 1.4018
Epoch 10/100
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 43ms/step - loss: 1.3644
Epoch 11/1

<keras.src.callbacks.history.History at 0x18f1832e2d0>

In [4]:
encoder_model = Model(encoder_inputs, encoder_states)

decoder_state_input_h = Input(shape=(256,))
decoder_state_input_c = Input(shape=(256,))
decoder_states_inputs = [decoder_state_input_h, decoder_state_input_c]

dec_emb2 = Embedding(input_dim=jap_vocab_size, output_dim=128, mask_zero=True)(decoder_inputs)
decoder_outputs2, state_h2, state_c2 = decoder_lstm(dec_emb2, initial_state=decoder_states_inputs)
decoder_states2 = [state_h2, state_c2]
decoder_outputs2 = decoder_dense(decoder_outputs2)

decoder_model = Model([decoder_inputs] + decoder_states_inputs, [decoder_outputs2] + decoder_states2)

def translate_sentence(input_text):
    input_seq = tokenizer_eng.texts_to_sequences([input_text])
    input_seq = pad_sequences(input_seq, maxlen=eng_max_len, padding='post')
    states_value = encoder_model.predict(input_seq)
    
    target_seq = np.zeros((1, 1))
    target_seq[0, 0] = tokenizer_jap.word_index['<start>'] if '<start>' in tokenizer_jap.word_index else 1
    
    translated_sentence = ""
    for _ in range(jap_max_len):
        output_tokens, h, c = decoder_model.predict([target_seq] + states_value)
        sampled_token_index = np.argmax(output_tokens[0, -1, :])
        sampled_word = [word for word, index in tokenizer_jap.word_index.items() if index == sampled_token_index]
        if sampled_word:
            translated_sentence += sampled_word[0] + " "
        
        if sampled_word and sampled_word[0] == '<end>':
            break
        
        target_seq[0, 0] = sampled_token_index
        states_value = [h, c]
    
    return translated_sentence.strip()

# Example usage
print(translate_sentence("how are"))

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 172ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 161ms/step
お元気ですか
