In [2]:
!pip install tensorflow nltk




Data Collection and Preprocessing

In [4]:
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
import numpy as np

# Sample data (extend this with your own dataset)
english_texts = ["Hello world", "How are you?", "Good morning", "Good night", "Thank you"]
amharic_texts = ["ሰላም ልዑል", "እንዴት ነህ?", "እንኳን ደህና አደረህ", "መልካም ሌሊት", "አመሰግናለሁ"]

# Function to preprocess texts
def preprocess_texts(texts):
    tokenizer = Tokenizer()
    tokenizer.fit_on_texts(texts)
    sequences = tokenizer.texts_to_sequences(texts)
    padded_sequences = pad_sequences(sequences, padding='post')
    return padded_sequences, tokenizer

# Preprocess English and Amharic texts
english_sequences, english_tokenizer = preprocess_texts(english_texts)
amharic_sequences, amharic_tokenizer = preprocess_texts(amharic_texts)


Model Development

In [5]:
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, LSTM, Dense, Embedding

def build_model(input_vocab_size, output_vocab_size, embedding_dim, units):
    # Encoder
    encoder_inputs = Input(shape=(None,))
    encoder_embedding = Embedding(input_vocab_size, embedding_dim)(encoder_inputs)
    encoder_lstm = LSTM(units, return_state=True)
    encoder_outputs, state_h, state_c = encoder_lstm(encoder_embedding)
    encoder_states = [state_h, state_c]

    # Decoder
    decoder_inputs = Input(shape=(None,))
    decoder_embedding = Embedding(output_vocab_size, embedding_dim)(decoder_inputs)
    decoder_lstm = LSTM(units, return_sequences=True, return_state=True)
    decoder_outputs, _, _ = decoder_lstm(decoder_embedding, initial_state=encoder_states)
    decoder_dense = Dense(output_vocab_size, activation='softmax')
    decoder_outputs = decoder_dense(decoder_outputs)

    # Model
    model = Model([encoder_inputs, decoder_inputs], decoder_outputs)
    return model

input_vocab_size = len(english_tokenizer.word_index) + 1
output_vocab_size = len(amharic_tokenizer.word_index) + 1
embedding_dim = 256
units = 512

model = build_model(input_vocab_size, output_vocab_size, embedding_dim, units)
model.compile(optimizer='adam', loss='categorical_crossentropy')

model.summary()


Model: "model"
__________________________________________________________________________________________________
 Layer (type)                Output Shape                 Param #   Connected to                  
 input_1 (InputLayer)        [(None, None)]               0         []                            
                                                                                                  
 input_2 (InputLayer)        [(None, None)]               0         []                            
                                                                                                  
 embedding (Embedding)       (None, None, 256)            2560      ['input_1[0][0]']             
                                                                                                  
 embedding_1 (Embedding)     (None, None, 256)            2816      ['input_2[0][0]']             
                                                                                              

 Training the Model

In [7]:
# One-hot encode the target sequences
amharic_sequences_onehot = tf.keras.utils.to_categorical(amharic_sequences[:, 1:], num_classes=output_vocab_size)

# Train the model
history = model.fit(
    [english_sequences, amharic_sequences[:, :-1]],
    amharic_sequences_onehot,
    batch_size=2,
    epochs=500,
    validation_split=0.2
)


Epoch 1/500
Epoch 2/500
Epoch 3/500
Epoch 4/500
Epoch 5/500
Epoch 6/500
Epoch 7/500
Epoch 8/500
Epoch 9/500
Epoch 10/500
Epoch 11/500
Epoch 12/500
Epoch 13/500
Epoch 14/500
Epoch 15/500
Epoch 16/500
Epoch 17/500
Epoch 18/500
Epoch 19/500
Epoch 20/500
Epoch 21/500
Epoch 22/500
Epoch 23/500
Epoch 24/500
Epoch 25/500
Epoch 26/500
Epoch 27/500
Epoch 28/500
Epoch 29/500
Epoch 30/500
Epoch 31/500
Epoch 32/500
Epoch 33/500
Epoch 34/500
Epoch 35/500
Epoch 36/500
Epoch 37/500
Epoch 38/500
Epoch 39/500
Epoch 40/500
Epoch 41/500
Epoch 42/500
Epoch 43/500
Epoch 44/500
Epoch 45/500
Epoch 46/500
Epoch 47/500
Epoch 48/500
Epoch 49/500
Epoch 50/500
Epoch 51/500
Epoch 52/500
Epoch 53/500
Epoch 54/500
Epoch 55/500
Epoch 56/500
Epoch 57/500
Epoch 58/500
Epoch 59/500
Epoch 60/500
Epoch 61/500
Epoch 62/500
Epoch 63/500
Epoch 64/500
Epoch 65/500
Epoch 66/500
Epoch 67/500
Epoch 68/500
Epoch 69/500
Epoch 70/500
Epoch 71/500
Epoch 72/500
Epoch 73/500
Epoch 74/500
Epoch 75/500
Epoch 76/500
Epoch 77/500
Epoch 78

5. Evaluation

In [8]:
from nltk.translate.bleu_score import sentence_bleu

def evaluate_model(model, input_text, tokenizer, max_length):
    # Tokenize and pad input text
    sequence = tokenizer.texts_to_sequences([input_text])
    sequence = pad_sequences(sequence, maxlen=max_length, padding='post')

    # Predict translation
    prediction = model.predict([sequence, np.zeros((1, max_length))])
    predicted_sequence = prediction.argmax(axis=-1)

    # Convert sequence back to text
    reverse_tokenizer = {v: k for k, v in tokenizer.word_index.items()}
    predicted_text = ' '.join([reverse_tokenizer[idx] for idx in predicted_sequence[0] if idx != 0])
    return predicted_text

# Example usage
input_text = "Hello world"
max_length = 10
predicted_text = evaluate_model(model, input_text, english_tokenizer, max_length)
print("Predicted translation:", predicted_text)


Predicted translation: good good good


 Save and Load the Model


In [9]:
model.save('translation_model.h5')

# Save tokenizers
import pickle
with open('english_tokenizer.pkl', 'wb') as f:
    pickle.dump(english_tokenizer, f)

with open('amharic_tokenizer.pkl', 'wb') as f:
    pickle.dump(amharic_tokenizer, f)


  saving_api.save_model(


To load the model and tokenizers

In [10]:
from tensorflow.keras.models import load_model
import pickle

# Load model
model = load_model('translation_model.h5')

# Load tokenizers
with open('english_tokenizer.pkl', 'rb') as f:
    english_tokenizer = pickle.load(f)

with open('amharic_tokenizer.pkl', 'rb') as f:
    amharic_tokenizer = pickle.load(f)
