In [None]:
!pip install tensorflow transformers



In [None]:
from google.colab import drive
import os

# Mount Google Drive
drive.mount('/content/drive')

# Set directory paths
data_dir = "/content/drive/MyDrive/New"
amh_path = os.path.join(data_dir, "Amh.txt")
eng_path = os.path.join(data_dir, "Eng.txt")

# Load data: one sentence per line.
# The line terminator is stripped here because the tokenizers below are built with
# filters='' and therefore split on spaces only; a trailing "\n" would stay glued to
# the last word of every sentence and create spurious "word\n" vocabulary entries.
# Blank lines are kept (as empty strings) so the two files stay line-aligned.
with open(amh_path, 'r', encoding='utf-8') as f:
    amh_sentences = [line.strip() for line in f]

with open(eng_path, 'r', encoding='utf-8') as f:
    eng_sentences = [line.strip() for line in f]

# Later cells index both corpora with the same row indices, so they must have the
# same number of lines.
if len(amh_sentences) != len(eng_sentences):
    raise ValueError(
        f"Parallel corpus is misaligned: {len(amh_sentences)} Amharic lines "
        f"vs {len(eng_sentences)} English lines."
    )

print(f"Loaded {len(amh_sentences)} Amharic sentences and {len(eng_sentences)} English sentences.")


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
Loaded 10892 Amharic sentences and 10892 English sentences.


In [None]:
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

# Tokenization
def create_tokenizer(sentences):
    """Build a Keras ``Tokenizer`` and fit its vocabulary on ``sentences``.

    Character filtering is switched off (``filters=''``) and words not seen
    during fitting are mapped to the ``"<unk>"`` out-of-vocabulary token.

    Args:
        sentences: the texts to fit the vocabulary on.

    Returns:
        The fitted ``Tokenizer`` instance.
    """
    fitted = Tokenizer(oov_token="<unk>", filters='')
    fitted.fit_on_texts(sentences)
    return fitted

# One tokenizer per language, each fitted on its own side of the corpus.
amh_tokenizer, eng_tokenizer = map(create_tokenizer, (amh_sentences, eng_sentences))

# Vocabulary size is the number of indexed words plus one extra slot
# (presumably for id 0, the default padding value -- confirm against pad_sequences).
amh_vocab_size, eng_vocab_size = (
    1 + len(tok.word_index) for tok in (amh_tokenizer, eng_tokenizer)
)

# Preprocess sentences
def preprocess_sentences(sentences, tokenizer, max_length=40):
    """Convert texts to fixed-length integer sequences.

    Args:
        sentences: the texts to encode.
        tokenizer: a fitted tokenizer exposing ``texts_to_sequences``.
        max_length: target length passed to ``pad_sequences`` as ``maxlen``.

    Returns:
        The result of ``pad_sequences`` with padding appended at the end
        (``padding='post'``).
    """
    token_ids = tokenizer.texts_to_sequences(sentences)
    padded = pad_sequences(token_ids, padding='post', maxlen=max_length)
    return padded

# Sequence length shared by both languages (overrides the helper's default of 40).
max_length = 45

# Encode and pad each side of the corpus with its own tokenizer.
amh_data = preprocess_sentences(
    sentences=amh_sentences, tokenizer=amh_tokenizer, max_length=max_length
)
eng_data = preprocess_sentences(
    sentences=eng_sentences, tokenizer=eng_tokenizer, max_length=max_length
)

print(f"Amharic data shape: {amh_data.shape}, English data shape: {eng_data.shape}")


Amharic data shape: (10892, 45), English data shape: (10892, 45)


In [None]:
import tensorflow as tf
from tensorflow.keras.layers import Input, Embedding, LSTM, Dense

# Define the Seq2Seq Model
class Seq2SeqModel(tf.keras.Model):
    """LSTM encoder-decoder producing per-step scores over the decoder vocabulary.

    The encoder embeds the source ids and runs them through an LSTM, keeping
    only its final hidden and cell states. Those two states initialise the
    decoder LSTM, which reads the embedded decoder input and returns one output
    per time step. A dense layer with no activation projects each step to
    ``decoder_vocab_size`` values.

    Args:
        encoder_vocab_size: input dimension of the encoder embedding.
        decoder_vocab_size: input dimension of the decoder embedding and width
            of the final dense projection.
        embedding_dim: output dimension of both embeddings.
        units: number of units in both LSTMs.
    """

    def __init__(self, encoder_vocab_size, decoder_vocab_size, embedding_dim, units):
        super().__init__()

        # Encoder side: embedding followed by an LSTM that also returns its states.
        self.encoder_embedding = Embedding(
            input_dim=encoder_vocab_size, output_dim=embedding_dim
        )
        self.encoder_lstm = LSTM(units, return_state=True)

        # Decoder side: embedding, sequence-returning LSTM, vocabulary projection.
        self.decoder_embedding = Embedding(
            input_dim=decoder_vocab_size, output_dim=embedding_dim
        )
        self.decoder_lstm = LSTM(units, return_state=True, return_sequences=True)
        self.fc = Dense(decoder_vocab_size)

    def call(self, inputs, training=False):
        """Run the encoder and decoder on an ``(enc_input, dec_input)`` pair.

        Args:
            inputs: a two-element sequence ``(enc_input, dec_input)``.
            training: accepted for the Keras call signature; it is not
                forwarded to any sub-layer here.

        Returns:
            The dense projection of the decoder LSTM's per-step outputs.
        """
        source_ids, target_ids = inputs

        # Encoder: only the final hidden/cell states are used downstream.
        source_vectors = self.encoder_embedding(source_ids)
        _, state_h, state_c = self.encoder_lstm(source_vectors)

        # Decoder: start from the encoder's final states; its own states are discarded.
        target_vectors = self.decoder_embedding(target_ids)
        step_outputs = self.decoder_lstm(
            target_vectors, initial_state=[state_h, state_c]
        )[0]

        return self.fc(step_outputs)


In [None]:
from sklearn.model_selection import KFold
from nltk.translate.bleu_score import corpus_bleu

# BLEU metric
def calculate_bleu(predictions, references, pad_token_id=0):
    """Corpus-level BLEU between predicted and reference token sequences.

    Padding tokens are removed from both sides before scoring. Without this,
    the runs of padding that fill out every fixed-length row are counted as
    matching n-grams and inflate the score.

    Args:
        predictions: iterable of hypothesis token sequences, one per example.
        references: iterable of reference token sequences aligned with
            ``predictions``; each hypothesis has exactly one reference.
        pad_token_id: token value to drop from every sequence before scoring.
            Defaults to 0 (assumed to be the padding id used upstream).
            Pass ``None`` to score the sequences exactly as given.

    Returns:
        The value returned by ``nltk.translate.bleu_score.corpus_bleu``.
    """
    def _without_padding(sequence):
        if pad_token_id is None:
            return list(sequence)
        return [token for token in sequence if token != pad_token_id]

    hypotheses = [_without_padding(pred) for pred in predictions]
    # corpus_bleu expects a list of references per hypothesis, hence the extra nesting.
    list_of_references = [[_without_padding(ref)] for ref in references]
    return corpus_bleu(list_of_references, hypotheses)

# K-Fold split: shuffled folds with a fixed random_state so the
# train/validation partition is the same on every run.
num_folds = 5
kf = KFold(random_state=42, shuffle=True, n_splits=num_folds)

In [None]:
# numpy is needed for the cross-fold mean at the bottom of this cell. It was not
# imported anywhere above, which made the final print raise NameError after all
# folds had already been trained.
import numpy as np

# Model and training hyperparameters
embedding_dim = 256
units = 512
batch_size = 32
epochs = 3
bleu_scores = []

for fold, (train_idx, val_idx) in enumerate(kf.split(amh_data)):
    print(f"Training on Fold {fold + 1}/{num_folds}...")

    # A new model is built for every fold; release the previous fold's Keras
    # state first so memory does not accumulate across folds.
    tf.keras.backend.clear_session()

    # Split data: the same row indices are applied to both languages so that
    # source/target pairs stay aligned.
    train_amh, val_amh = amh_data[train_idx], amh_data[val_idx]
    train_eng, val_eng = eng_data[train_idx], eng_data[val_idx]

    # Define the model
    model = Seq2SeqModel(amh_vocab_size, eng_vocab_size, embedding_dim, units)

    # Compile the model. The model's last layer has no activation, so the loss
    # is configured to consume logits.
    loss_object = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)
    model.compile(optimizer=tf.keras.optimizers.Adam(learning_rate=1e-3),
                  loss=loss_object,
                  metrics=['accuracy'])

    # Train the model with teacher forcing: the decoder is fed every target
    # token except the last and is trained to predict the sequence shifted by one.
    # NOTE(review): no mask or sample_weight is passed, so padded positions
    # contribute to both the loss and the reported accuracy.
    model.fit(
        [train_amh, train_eng[:, :-1]], train_eng[:, 1:],
        validation_data=([val_amh, val_eng[:, :-1]], val_eng[:, 1:]),
        batch_size=batch_size,
        epochs=epochs
    )

    # Evaluate on validation set.
    # NOTE(review): the reference tokens are fed as decoder input here too, so
    # this BLEU measures next-token prediction under teacher forcing rather than
    # free-running translation quality.
    val_logits = model.predict([val_amh, val_eng[:, :-1]])
    val_predictions = np.argmax(val_logits, axis=-1)

    bleu = calculate_bleu(val_predictions, val_eng[:, 1:])
    bleu_scores.append(bleu)
    print(f"BLEU score for Fold {fold + 1}: {bleu:.4f}")

# Average BLEU score
print(f"Average BLEU score across folds: {np.mean(bleu_scores):.4f}")


Training on Fold 1/5...
Epoch 1/3
Epoch 2/3
Epoch 3/3
BLEU score for Fold 1: 0.5431
Training on Fold 2/5...
Epoch 1/3
Epoch 2/3
Epoch 3/3
BLEU score for Fold 2: 0.5428
Training on Fold 3/5...
Epoch 1/3
Epoch 2/3
Epoch 3/3
BLEU score for Fold 3: 0.5428
Training on Fold 4/5...
Epoch 1/3
Epoch 2/3
Epoch 3/3
BLEU score for Fold 4: 0.5289
Training on Fold 5/5...
Epoch 1/3
Epoch 2/3
Epoch 3/3
BLEU score for Fold 5: 0.5395


NameError: name 'np' is not defined