# Install necessary packages

In [1]:
!pip install tensorflow nltk



# Import necessary libraries

In [12]:
import tensorflow as tf
import numpy as np
import json
from tensorflow.keras.models import Sequential, load_model
from tensorflow.keras.layers import Embedding, LSTM, Dense, Bidirectional
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from nltk.translate.bleu_score import sentence_bleu
import nltk

nltk.download('punkt')

# Create English sentences file
english_sentences = """
Hello world
How are you?
Good morning
Thank you
I love programming
See you later
What is your name?
Have a great day
Where is the nearest hospital?
I need help
"""
!mkdir -p /content/data
with open('/content/data/english_sentences.txt', 'w', encoding='utf-8') as f:
    f.write(english_sentences.strip())

# Create Amharic sentences file
amharic_sentences = """
ሰላም ልዑል
እንዴት ነህ?
እንኳን ደህና አደርህ
አመሰግናለሁ
እኔ ፕሮግራሚንግ እወዳለሁ
ኋላ እንገናኝ
ስምህ ማን ነው?
በጣም ጥሩ ቀን አለህ
ቅርብ ሆስፒታል ወዴት ነው?
እርዳታ ያስፈልገኛል
"""

with open('/content/data/amharic_sentences.txt', 'w', encoding='utf-8') as f:
    f.write(amharic_sentences.strip())

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


# Data Loading

In [13]:
def load_data(file_path):
    """Read a UTF-8 text file and return its lines as a list of sentences.

    Each line becomes one list element with its line terminator removed.
    A newline at the very end of the file does not produce an extra empty
    sentence, so a corpus saved with or without a final newline loads the
    same way. Blank lines inside the file are kept (as empty strings) so that
    line numbers stay aligned between parallel corpus files.

    Args:
        file_path: Path of the text file to read.

    Returns:
        list[str]: One entry per line; an empty file yields an empty list.
    """
    with open(file_path, 'r', encoding='utf-8') as file:
        # Iterating the handle yields one line at a time, each still carrying
        # its terminating '\n' (except possibly the last one).
        return [line.rstrip('\n') for line in file]


# Preprocessing


In [14]:
def preprocess_data(data, max_len=None):
    """Tokenize sentences and turn them into a post-padded integer matrix.

    Args:
        data: Iterable of sentences (strings) to fit the tokenizer on and
            to encode.
        max_len: Length every sequence is padded/truncated to. When omitted
            (or falsy), the length of the longest encoded sentence is used.

    Returns:
        tuple: ``(padded_sequences, tokenizer, vocab_size, max_len)`` where
        ``padded_sequences`` is the output of ``pad_sequences``, ``tokenizer``
        is the fitted ``Tokenizer``, ``vocab_size`` is the number of distinct
        words plus one, and ``max_len`` is the sequence length actually used.
    """
    tokenizer = Tokenizer()
    tokenizer.fit_on_texts(data)
    sequences = tokenizer.texts_to_sequences(data)

    if not max_len:
        max_len = max(len(seq) for seq in sequences)

    padded_sequences = pad_sequences(sequences, padding='post', maxlen=max_len)

    # +1 leaves room for index 0, which is the padding filler and is never
    # assigned to a word.
    vocab_size = len(tokenizer.word_index) + 1

    # Exactly four values: callers unpack (padded, tokenizer, vocab_size, max_len).
    return padded_sequences, tokenizer, vocab_size, max_len

# Model Building

In [15]:
def preprocess_data(data, max_len=None):
    """Fit a tokenizer on `data` and encode it as post-padded id sequences.

    Args:
        data: Iterable of sentences (strings).
        max_len: Target sequence length. When omitted (or falsy), the length
            of the longest encoded sentence is used.

    Returns:
        tuple: ``(padded_sequences, tokenizer, vocab_size, max_len)``.
        ``vocab_size`` is ``len(tokenizer.word_index) + 1`` so that index 0,
        the padding filler, is covered as well.
    """
    tokenizer = Tokenizer()
    tokenizer.fit_on_texts(data)
    sequences = tokenizer.texts_to_sequences(data)
    if not max_len:
        max_len = max(len(seq) for seq in sequences)
    padded_sequences = pad_sequences(sequences, padding='post', maxlen=max_len)
    vocab_size = len(tokenizer.word_index) + 1
    return padded_sequences, tokenizer, vocab_size, max_len


def build_model(source_vocab_size, target_vocab_size, embedding_dim, max_len, lstm_units=256):
    """Build and compile the per-position translation model used for training.

    The network embeds each source token, runs a bidirectional LSTM that
    returns one state per time step, and applies a softmax over the target
    vocabulary at every position. The output therefore has shape
    ``(batch, max_len, target_vocab_size)``, which is what the training cell
    (integer targets with a trailing axis of size 1) and the evaluation code
    (arg-max over the last axis of each prediction) expect.

    NOTE(review): the layer types follow the notebook's Keras imports; the
    layer sizes are a baseline chosen during review - TODO confirm against the
    originally intended architecture.

    Args:
        source_vocab_size: Number of rows in the source embedding table.
        target_vocab_size: Number of classes predicted at each position.
        embedding_dim: Size of the source token embeddings.
        max_len: Fixed length of the (padded) input sequences.
        lstm_units: Hidden units per LSTM direction.

    Returns:
        A compiled Keras ``Sequential`` model.
    """
    model = Sequential([
        tf.keras.Input(shape=(max_len,)),
        Embedding(source_vocab_size, embedding_dim),
        # return_sequences=True keeps one output per input position.
        Bidirectional(LSTM(lstm_units, return_sequences=True)),
        Dense(target_vocab_size, activation='softmax'),
    ])
    # Sparse loss: targets are integer word ids, not one-hot vectors.
    model.compile(
        optimizer='adam',
        loss='sparse_categorical_crossentropy',
        metrics=['accuracy'],
    )
    return model

# Load datasets

In [16]:
source_sentences = load_data('/content/data/english_sentences.txt')
target_sentences = load_data('/content/data/amharic_sentences.txt')

# The corpora are paired line by line, so a length mismatch means every
# following (source, target) pair would be wrong. Fail early instead.
assert len(source_sentences) == len(target_sentences), (
    f"Parallel corpora differ in length: {len(source_sentences)} source vs "
    f"{len(target_sentences)} target sentences"
)

# Preprocess datasets

In [19]:
# Encode each language with its own tokenizer; every call returns
# (padded ids, fitted tokenizer, vocabulary size, longest sentence length).
(source_padded, source_tokenizer,
 source_vocab_size, source_max_len) = preprocess_data(source_sentences)
(target_padded, target_tokenizer,
 target_vocab_size, target_max_len) = preprocess_data(target_sentences)

# Bring both sides to one common length by post-padding to the longer of the two.
max_len = max(source_max_len, target_max_len)
source_padded, target_padded = [
    pad_sequences(padded, padding='post', maxlen=max_len)
    for padded in (source_padded, target_padded)
]

# Build and train model

In [20]:
embedding_dim = 100

# The inputs were re-padded to the shared `max_len` above and evaluation also
# pads to `max_len`, so the model must be built for that length rather than
# for `source_max_len` (the two only coincide when the source side happens to
# contain the longest sentence).
model = build_model(source_vocab_size, target_vocab_size, embedding_dim, max_len)

# Targets get a trailing axis of size 1: one integer class id per position.
# NOTE(review): with 10 sentences and validation_split=0.2 only 8 samples are
# trained on, in a single batch per epoch, i.e. 10 weight updates in total.
# The evaluation output recorded further down shows only empty predictions,
# so considerably more epochs (or data) are needed for useful translations.
model.fit(source_padded, np.expand_dims(target_padded, -1), epochs=10, batch_size=32, validation_split=0.2)


Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.src.callbacks.History at 0x7bd97efa0fd0>

# Save model and tokenizers


In [21]:
# Persist the trained network.
model.save('translation_model.h5')

# Persist both fitted tokenizers as JSON next to the model.
for json_path, fitted_tokenizer in (
    ('source_tokenizer.json', source_tokenizer),
    ('target_tokenizer.json', target_tokenizer),
):
    with open(json_path, 'w') as f:
        f.write(fitted_tokenizer.to_json())


  saving_api.save_model(


In [23]:

def evaluate_model(model, source_tokenizer, target_tokenizer, source_sentences, target_sentences, source_max_len):
    """Translate each source sentence and print it with a sentence-level BLEU score.

    For every sentence pair the function prints the reference sentence, the
    decoded prediction and the BLEU score, followed by a blank line.

    The reference is tokenized with `target_tokenizer`, i.e. exactly the way
    the words the model can emit were normalised. Splitting the raw reference
    on whitespace would leave punctuation attached to words, which can never
    match a predicted token. BLEU is smoothed and re-weighted for short
    hypotheses, because the default unsmoothed 4-gram score collapses towards
    zero for sentences shorter than four words even when they match.

    Args:
        model: Trained model whose `predict` returns, per input sentence, one
            score vector over the target vocabulary for each position.
        source_tokenizer: Fitted tokenizer used to encode `source_sentences`.
        target_tokenizer: Fitted tokenizer used to decode predictions and to
            tokenize the references.
        source_sentences: Sentences to translate.
        target_sentences: Reference translations, aligned with
            `source_sentences`.
        source_max_len: Length the encoded source sentences are padded to.

    Returns:
        None. Results are printed.

    Raises:
        ValueError: If the two sentence lists differ in length.
    """
    # Local import keeps this cell self-contained; nltk is already a
    # dependency of the notebook.
    from nltk.translate.bleu_score import SmoothingFunction

    if len(source_sentences) != len(target_sentences):
        raise ValueError(
            f"Got {len(source_sentences)} source sentences but "
            f"{len(target_sentences)} target sentences"
        )

    source_sequences = source_tokenizer.texts_to_sequences(source_sentences)
    source_padded = pad_sequences(source_sequences, maxlen=source_max_len, padding='post')

    predictions = model.predict(source_padded)
    index_word = target_tokenizer.index_word
    smoothing = SmoothingFunction().method1

    for target_sentence, prediction in zip(target_sentences, predictions):
        # Most likely word id at every position; id 0 is padding and is dropped.
        predicted_ids = np.argmax(prediction, axis=-1)
        predicted_tokens = [index_word[int(idx)] for idx in predicted_ids if idx != 0]
        predicted_sentence = ' '.join(predicted_tokens)

        # Round-trip the reference through the tokenizer so it is normalised
        # the same way as the predicted words.
        reference_ids = target_tokenizer.texts_to_sequences([target_sentence])[0]
        reference_tokens = [index_word[idx] for idx in reference_ids]

        if predicted_tokens and reference_tokens:
            bleu = sentence_bleu(
                [reference_tokens],
                predicted_tokens,
                smoothing_function=smoothing,
                auto_reweigh=True,
            )
        else:
            # Nothing to compare: score an empty side as 0.
            bleu = 0

        print(f"Original: {target_sentence}")
        print(f"Predicted: {predicted_sentence}")
        print(f"BLEU score: {bleu}")
        print()


In [24]:
# Score the model on the sentence pairs, padding inputs to the shared `max_len`.
evaluate_model(
    model=model,
    source_tokenizer=source_tokenizer,
    target_tokenizer=target_tokenizer,
    source_sentences=source_sentences,
    target_sentences=target_sentences,
    source_max_len=max_len,
)

Original: ሰላም ልዑል
Predicted: 
BLEU score: 0

Original: እንዴት ነህ?
Predicted: 
BLEU score: 0

Original: እንኳን ደህና አደርህ
Predicted: 
BLEU score: 0

Original: አመሰግናለሁ
Predicted: 
BLEU score: 0

Original: እኔ ፕሮግራሚንግ እወዳለሁ
Predicted: 
BLEU score: 0

Original: ኋላ እንገናኝ
Predicted: 
BLEU score: 0

Original: ስምህ ማን ነው?
Predicted: 
BLEU score: 0

Original: በጣም ጥሩ ቀን አለህ
Predicted: 
BLEU score: 0

Original: ቅርብ ሆስፒታል ወዴት ነው?
Predicted: 
BLEU score: 0

Original: እርዳታ ያስፈልገኛል
Predicted: 
BLEU score: 0

