<a href="https://colab.research.google.com/github/KagontleBooysen/alu-machine_learning/blob/master/Translation.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [20]:
import zipfile
import os
import tensorflow as tf
from tensorflow.keras.layers import Embedding, LSTM, Dense, Bidirectional, Input
from tensorflow.keras.models import Model
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd


from google.colab import drive

# Mount Google Drive7

drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [2]:
import tensorflow as tf
tf.test.gpu_device_name()

''

In [3]:
import tensorflow as tf

# Check if TensorFlow can see the GPU
print("Num GPUs Available: ", len(tf.config.experimental.list_physical_devices('GPU')))

# Optional: List all physical devices
print(tf.config.experimental.list_physical_devices())


Num GPUs Available:  0
[PhysicalDevice(name='/physical_device:CPU:0', device_type='CPU')]


In [4]:
import tensorflow as tf

# Create a simple operation
with tf.device('/GPU:0'):
    a = tf.constant([[1.0, 2.0], [3.0, 4.0]])
    b = tf.constant([[1.0, 1.0], [0.0, 1.0]])
    c = tf.matmul(a, b)

print(c)


tf.Tensor(
[[1. 3.]
 [3. 7.]], shape=(2, 2), dtype=float32)


In [5]:
# Extract the zip file
def extract_zip(zip_file_path, extract_to='.'):
    with zipfile.ZipFile(zip_file_path, 'r') as zip_ref:
        zip_ref.extractall(extract_to)

# Assume the zip file is named 'dataset.zip' and contains 'eng_setswana.txt'
extract_zip('/content/drive/MyDrive/english_setswana.zip',extract_to='/content/drive/MyDrive/')

In [6]:
# Set the file paths for English and Setswana sentences
eng_file_path = '/content/drive/MyDrive/english_setswana/english.txt'
sets_file_path = '/content/drive/MyDrive/english_setswana/setswana.txt'

In [7]:
# Function to load and process data
def load_and_process_data(eng_file_path, sets_file_path):
    with open(eng_file_path, 'r', encoding='utf-8') as file:
        english_sentences = file.read().splitlines()

    with open(sets_file_path, 'r', encoding='utf-8') as file:
        setswana_sentences = file.read().splitlines()

    setswana_sentences = [f'startseq {sent} endseq' for sent in setswana_sentences]

    return english_sentences, setswana_sentences

In [8]:
# Function to print the first few lines of a file
def print_file_preview(file_path, num_lines=5):
    with open(file_path, 'r', encoding='ISO-8859-1') as file:
        for _ in range(num_lines):
            print(file.readline().strip())

In [9]:
# Check the contents of the English file
print("Preview of the English file:")
print_file_preview('/content/drive/MyDrive/english_setswana/english.txt')

# Check the contents of the Setswana file
print("Preview of the Setswana file:")
print_file_preview('/content/drive/MyDrive/english_setswana/setswana.txt')

Preview of the English file:
Good progress has been made with the staff composition project in each of the 15 faculties at the NWU .
The rector , Prof Thanyani Mariba , congratulated the newcomers on their choice to further their studies at the campus and emphasised the importance of choice and responsibility - both in terms of academic commitments and social endeavours .
Complaints against Correctional Services staff , court officials and members of the South African National Defence Force .
biliary duct
What's noticeable is that the Mafikeng participants in the 2005 survey were not particularly impressed with their working environments .
Preview of the Setswana file:
Lo tla lemoga gore Thulaganyo ya Setheo ya 2012-2014 e e dirwang mo mafapheng otlhe ka tsamaiso ya ditumalano tsa go dira tiro ke ya gore YBB e fitlhelele maikemisetso a yone kgato ka kgato .
Moreketoro , Mop Thanyani Mariba , o ne a akgolela batlaboÅ¡eng tlhopho e ba e dirileng ya go tla go tswelela dithuto tsa bone mo 

In [10]:
# Load data from the two files
def load_data(file_path_english, file_path_setswana):
    english_sentences = []
    setswana_sentences = []

    with open(file_path_english, 'r', encoding='ISO-8859-1') as file_eng, open(file_path_setswana, 'r', encoding='ISO-8859-1') as file_sets:
        for eng, sets in zip(file_eng, file_sets):
            english_sentences.append(eng.strip())
            setswana_sentences.append(sets.strip())

    return english_sentences, setswana_sentences

In [11]:
# Load the data from the extracted files
english_sentences, setswana_sentences = load_data(
    '/content/drive/MyDrive/english_setswana/english.txt',
    '/content/drive/MyDrive/english_setswana/setswana.txt'
)

In [12]:
# Add startseq and endseq tokens to Setswana sentences
def add_sequence_tokens(sentences):
    return [f'startseq {sentence} endseq' for sentence in sentences]

# Add tokens to Setswana sentences
setswana_sentences = add_sequence_tokens(setswana_sentences)

In [28]:
# Tokenize the sentences
tokenizer_eng = Tokenizer()
tokenizer_eng.fit_on_texts(english_sentences)
sequences_eng = tokenizer_eng.texts_to_sequences(english_sentences)
max_len_eng = max([len(seq) for seq in sequences_eng])
padded_eng = pad_sequences(sequences_eng, maxlen=max_len_eng, padding='post')

tokenizer_setswana = Tokenizer()
tokenizer_setswana.fit_on_texts(setswana_sentences)
sequences_setswana = tokenizer_setswana.texts_to_sequences(setswana_sentences)
max_len_setswana = max([len(seq) for seq in sequences_setswana])
padded_setswana = pad_sequences(sequences_setswana, maxlen=max_len_setswana, padding='post')

In [14]:
# Define model parameters
embedding_dim = 256
units = 1024
vocab_size_eng = len(tokenizer_eng.word_index) + 1
vocab_size_setswana = len(tokenizer_setswana.word_index) + 1

In [15]:
# Build the model
encoder_inputs = Input(shape=(max_len_eng,))
enc_emb = Embedding(vocab_size_eng, embedding_dim, mask_zero=True)(encoder_inputs)
encoder_lstm = Bidirectional(LSTM(units, return_state=True))
encoder_outputs, forward_h, forward_c, backward_h, backward_c = encoder_lstm(enc_emb)
state_h = tf.keras.layers.Concatenate()([forward_h, backward_h])
state_c = tf.keras.layers.Concatenate()([forward_c, backward_c])
encoder_states = [state_h, state_c]

decoder_inputs = Input(shape=(max_len_setswana,))
dec_emb_layer = Embedding(vocab_size_setswana, embedding_dim, mask_zero=True)
dec_emb = dec_emb_layer(decoder_inputs)
decoder_lstm = LSTM(units * 2, return_sequences=True, return_state=True)
decoder_outputs, _, _ = decoder_lstm(dec_emb, initial_state=encoder_states)
decoder_dense = Dense(vocab_size_setswana, activation='softmax')
decoder_outputs = decoder_dense(decoder_outputs)

model = Model([encoder_inputs, decoder_inputs], decoder_outputs)
model.compile(optimizer='rmsprop', loss='sparse_categorical_crossentropy', metrics=['accuracy'])

In [16]:
# Prepare the target data for training
target_data = np.expand_dims(padded_setswana, axis=-1)

In [None]:
# Train the model
model.fit([padded_eng, padded_setswana], target_data, batch_size=64, epochs=5, validation_split=0.2)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5

In [17]:
# Function to translate sentences
def translate_sentence(sentence):
    sequence = tokenizer_eng.texts_to_sequences([sentence])
    padded_sequence = pad_sequences(sequence, maxlen=max_len_eng, padding='post')
    states_value = encoder_model.predict(padded_sequence)

    target_seq = np.zeros((1, 1))
    target_seq[0, 0] = tokenizer_setswana.word_index['startseq']

    stop_condition = False
    translated_sentence = ''

    while not stop_condition:
        output_tokens, h, c = decoder_model.predict([target_seq] + states_value)

        sampled_token_index = np.argmax(output_tokens[0, -1, :])
        sampled_word = tokenizer_setswana.index_word.get(sampled_token_index, '')

        if sampled_word == 'endseq' or len(translated_sentence.split()) >= max_len_setswana:
            stop_condition = True
        else:
            if sampled_word != 'startseq':  # Skip 'startseq' token
                translated_sentence += ' ' + sampled_word

        # Debug print to trace the translation process
        print(f'Current translation: {translated_sentence.strip()}')

        target_seq = np.zeros((1, 1))
        target_seq[0, 0] = sampled_token_index

        states_value = [h, c]

    return translated_sentence.strip()

In [18]:
# Define encoder and decoder models for inference
encoder_model = Model(encoder_inputs, encoder_states)

decoder_state_input_h = Input(shape=(units*2,))
decoder_state_input_c = Input(shape=(units*2,))
decoder_states_inputs = [decoder_state_input_h, decoder_state_input_c]

dec_emb2 = dec_emb_layer(decoder_inputs)
decoder_outputs2, state_h2, state_c2 = decoder_lstm(dec_emb2, initial_state=decoder_states_inputs)
decoder_states2 = [state_h2, state_c2]
decoder_outputs2 = decoder_dense(decoder_outputs2)

decoder_model = Model(
    [decoder_inputs] + decoder_states_inputs,
    [decoder_outputs2] + decoder_states2
)

In [19]:
# Example translation
example_sentence = "The history of the world's most important social media platform, the author of the book, the book of Proverbs, the author of the book of Proverbs, the Book of Life The author of the e-mail address of the student's first class of students completed the alternaria social media freeked by the name of the author of the book 800 copies of the 182"
translated_sentence = translate_sentence(example_sentence)
print(f'Translated sentence: {translated_sentence}')

Current translation: namileng
Current translation: namileng nomination
Current translation: namileng nomination jokwe
Current translation: namileng nomination jokwe botsolotswa
Current translation: namileng nomination jokwe botsolotswa annelien
Current translation: namileng nomination jokwe botsolotswa annelien itemogetse
Current translation: namileng nomination jokwe botsolotswa annelien itemogetse tswale
Current translation: namileng nomination jokwe botsolotswa annelien itemogetse tswale pure
Current translation: namileng nomination jokwe botsolotswa annelien itemogetse tswale pure tshwaetse
Current translation: namileng nomination jokwe botsolotswa annelien itemogetse tswale pure tshwaetse wellness
Current translation: namileng nomination jokwe botsolotswa annelien itemogetse tswale pure tshwaetse wellness botlhokatsebe
Current translation: namileng nomination jokwe botsolotswa annelien itemogetse tswale pure tshwaetse wellness botlhokatsebe tekete
Current translation: namileng nom