In [5]:
import pickle
import numpy as np
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, Embedding, Bidirectional, LSTM, Dense



def build_translation_model(input_vocab_size, output_vocab_size, input_seq_length, output_seq_length, hidden_units):
    """Build an (uncompiled) encoder-decoder Keras model for sequence translation.

    The encoder is a bidirectional LSTM over the embedded source tokens; its
    final forward/backward states are concatenated and used to initialise the
    decoder LSTM, which runs over the embedded target tokens and feeds a
    per-timestep softmax over the output vocabulary.

    Args:
        input_vocab_size: number of distinct source token ids.
        output_vocab_size: number of distinct target token ids.
        input_seq_length: length of the (padded) source sequences.
        output_seq_length: length of the (padded) target sequences.
        hidden_units: embedding size and per-direction encoder LSTM size.
            The decoder LSTM uses ``2 * hidden_units`` so that it can accept
            the concatenated forward+backward encoder states.

    Returns:
        A ``Model`` taking ``[encoder_input, decoder_input]`` and producing a
        tensor of shape ``(batch, output_seq_length, output_vocab_size)``.
    """
    # Local import: Concatenate is not among the notebook's top-level Keras imports.
    from tensorflow.keras.layers import Concatenate

    encoder_input = Input(shape=(input_seq_length,))
    decoder_input = Input(shape=(output_seq_length,))

    # The Input layers already fix the sequence lengths, so the Embedding
    # layers do not need the (deprecated) input_length argument.
    encoder_embedding = Embedding(input_vocab_size, hidden_units)(encoder_input)
    decoder_embedding = Embedding(output_vocab_size, hidden_units)(decoder_input)

    # return_state=True makes the wrapper return
    # [output, forward_h, forward_c, backward_h, backward_c].
    # The original passed the encoder's output *sequence* as initial_state,
    # which is not a valid [h, c] state pair for an LSTM.
    _, forward_h, forward_c, backward_h, backward_c = Bidirectional(
        LSTM(hidden_units, return_state=True)
    )(encoder_embedding)
    state_h = Concatenate()([forward_h, backward_h])
    state_c = Concatenate()([forward_c, backward_c])

    # Decoder width must equal the concatenated state width.
    decoder_lstm = LSTM(2 * hidden_units, return_sequences=True)(
        decoder_embedding, initial_state=[state_h, state_c]
    )

    output = Dense(output_vocab_size, activation='softmax')(decoder_lstm)

    model = Model([encoder_input, decoder_input], output)

    return model

# Directory holding the training corpus files (machine-specific absolute path).
TRAIN_DIR = "/Users/eliezer/Documents/School/NLP_Translation/TrainS/"

# Maps '<lang>_train' -> list of sentences, each sentence a list of tokens.
tokenized_stores = {'fi_train': [], 'hi_train': []}
for key in tokenized_stores:
    # 'fi_train' -> TRAIN_DIR + "train.fi": the part after the 3-character
    # prefix is the file stem and the 2-letter language code is the extension.
    file_name = TRAIN_DIR + str(key)[3:] + "." + str(key)[0:2]

    # The context manager closes the handle (the original left it open), and
    # the explicit encoding keeps the result independent of the platform
    # default. NOTE(review): assumes the corpus files are UTF-8 - confirm.
    with open(file_name, encoding="utf-8") as corpus_file:
        sentences = corpus_file.read().split('\n')

    # Tokenise on single spaces; one token list per line of the file.
    for sentence in sentences:
        tokenized_stores[key].append(sentence.split(' '))

# Number of training sentences, measured on the 'fi_train' side.
train_size = len(tokenized_stores['fi_train'])

# Word -> occurrence count, one table per language.
fi_words = {}
hi_words = {}

for key, corpus in tokenized_stores.items():
    # Keys beginning with 'f' feed fi_words; every other key feeds hi_words.
    counts = fi_words if str(key)[0] == 'f' else hi_words
    for sentence in corpus:
        for word in sentence:
            counts[word] = counts.get(word, 0) + 1

# Vocabulary sizes are the number of distinct tokens seen per language.
fi_vocab, hi_vocab = len(fi_words), len(hi_words)
print("Number of Unique Words:")
print("> Tagalog:", str(fi_vocab))
print("> Hiligaynon:", str(hi_vocab))

# EM estimation of word-translation probabilities from the sentence-aligned
# corpus. t[(fi_word, hi_word)] is re-estimated each iteration from expected
# co-occurrence counts, normalised per hi_word.
t = {}
uniform = 1 / (fi_vocab * hi_vocab)  # initial value for every co-occurring pair
n_iters = 0
max_iters = 25
convergence_threshold = 0.005  # stop once no probability moves by more than this

fine_tune = 1  # not referenced in the visible cells; kept in case another cell reads it
has_converged = False

while n_iters < max_iters and not has_converged:
    n_iters += 1
    count = {}  # expected count of each (fi_word, hi_word) pair
    total = {}  # expected count mass per hi_word (normaliser)

    for index in range(train_size):
        fi_sentence = tokenized_stores['fi_train'][index]
        hi_sentence = tokenized_stores['hi_train'][index]

        # Per-sentence normaliser: total probability mass of each fi_word
        # over the hi_words of the paired sentence.
        s_total = {}
        for fi_word in fi_sentence:
            s_total[fi_word] = 0
            for hi_word in hi_sentence:
                if (fi_word, hi_word) not in t:
                    t[(fi_word, hi_word)] = uniform
                s_total[fi_word] += t[(fi_word, hi_word)]

        # Collect fractional counts.
        for fi_word in fi_sentence:
            for hi_word in hi_sentence:
                pair = (fi_word, hi_word)
                fraction = t[pair] / s_total[fi_word]
                count[pair] = count.get(pair, 0) + fraction
                total[hi_word] = total.get(hi_word, 0) + fraction

    # Update translation probabilities, tracking the largest change actually
    # made. (The original only recorded changes above the threshold, so the
    # converging iteration reported the sentinel -1 instead of a real value.)
    max_change = 0.0
    for pair, pair_count in count.items():
        new_prob = pair_count / total[pair[1]]
        max_change = max(max_change, abs(t[pair] - new_prob))
        t[pair] = new_prob

    has_converged = max_change <= convergence_threshold

    print(f"Iteration {n_iters} Completed, Maximum Change: {max_change}")

def translate_sentence(sentence, source_lang, target_lang):
    """Translate a sentence word by word using the global probability table ``t``.

    ``t`` maps ``(fi_word, hi_word)`` pairs to a probability. Each source word
    is replaced by the paired word with the highest probability; words with no
    entry in ``t`` (or whose best match is an empty string) are kept as-is.

    Args:
        sentence: whitespace-separated source text. Any run of whitespace
            (multiple spaces, tabs, newlines) separates words.
        source_lang: 'fi' or 'hi'.
        target_lang: the other one of 'fi' / 'hi'.

    Returns:
        The translated words joined by single spaces, or "" (after printing a
        message) when the language pair is not fi->hi or hi->fi.
    """
    # Position of the source-language word inside each (fi_word, hi_word) key.
    if source_lang == 'fi' and target_lang == 'hi':
        source_index = 0
    elif source_lang == 'hi' and target_lang == 'fi':
        source_index = 1
    else:
        print("Invalid language codes. Please use 'fi' for Filipino and 'hi' for Hiligaynon.")
        return ""

    # split() rather than split(' '): irregular whitespace no longer produces
    # empty or newline-bearing pseudo-words.
    words = sentence.split()

    # One pass over the table for the whole sentence instead of one full scan
    # per word. The strict '>' keeps the first-seen entry on ties, matching
    # what max() over the table order would return.
    wanted = set(words)
    best = {}
    for pair, prob in t.items():
        source_word = pair[source_index]
        if source_word in wanted and (source_word not in best or prob > best[source_word][1]):
            best[source_word] = (pair[1 - source_index], prob)

    translated_sentence = []
    for word in words:
        target_word = best[word][0] if word in best else None
        translated_sentence.append(target_word if target_word else word)
    return ' '.join(translated_sentence)



Number of Unique Words:
> Tagalog: 3232
> Hiligaynon: 3412
Iteration 1 Completed, Maximum Change: 0.9999999093183058
Iteration 2 Completed, Maximum Change: 0.5282816118455986
Iteration 3 Completed, Maximum Change: 0.2740332237784928
Iteration 4 Completed, Maximum Change: 0.21703835058408566
Iteration 5 Completed, Maximum Change: 0.1515512712946041
Iteration 6 Completed, Maximum Change: 0.12573496692659947
Iteration 7 Completed, Maximum Change: 0.10909316881355291
Iteration 8 Completed, Maximum Change: 0.08756514150161365
Iteration 9 Completed, Maximum Change: 0.06681065285038346
Iteration 10 Completed, Maximum Change: 0.04961487117304486
Iteration 11 Completed, Maximum Change: 0.04295371298440076
Iteration 12 Completed, Maximum Change: 0.03593964034454211
Iteration 13 Completed, Maximum Change: 0.02951721031707555
Iteration 14 Completed, Maximum Change: 0.02404611496355058
Iteration 15 Completed, Maximum Change: 0.01955710207806116
Iteration 16 Completed, Maximum Change: 0.015996756529

In [8]:
# Example usage: prompt for a sentence and a language pair on the console,
# then print the word-by-word translation.
sentence_to_translate = input("Enter your sentence: ")
source_language, target_language = (
    input("Enter the source language code (fi/hi): "),
    input("Enter the target language code (fi/hi): "),
)

translated_sentence = translate_sentence(
    sentence=sentence_to_translate,
    source_lang=source_language,
    target_lang=target_language,
)
print("Translated Sentence:", translated_sentence)


Enter your sentence:  akin yang pagkain
Enter the source language code (fi/hi):  fi
Enter the target language code (fi/hi):  hi


Translated Sentence: magilog yang pagkaun


In [4]:
import tkinter as tk
from tkinter import *
import tkinter.font as tkFont
from tkinter import messagebox
import string

def translate_sentence_with_probabilities(sentence, source_lang, target_lang):
    """Translate word by word and report the probability of each chosen word.

    Uses the global table ``t``, which maps ``(fi_word, hi_word)`` pairs to a
    probability. Each source word is replaced by the paired word with the
    highest probability; a word with no entry in ``t`` is kept unchanged and
    contributes a probability of 0.

    Args:
        sentence: whitespace-separated source text. Any run of whitespace
            (multiple spaces, tabs, newlines) separates words.
        source_lang: 'fi' or 'hi'.
        target_lang: the other one of 'fi' / 'hi'.

    Returns:
        ``(translated_text, per_word_probabilities, average_probability)``.
        For an unsupported language pair a message is printed and
        ``("", [], 0.0)`` is returned.
    """
    # Position of the source-language word inside each (fi_word, hi_word) key.
    if source_lang == 'fi' and target_lang == 'hi':
        source_index = 0
    elif source_lang == 'hi' and target_lang == 'fi':
        source_index = 1
    else:
        print("Invalid language codes. Please use 'fi' for Filipino and 'hi' for Hiligaynon.") # This is for testing
        return "", [], 0.0

    # split() rather than split(' '): repeated spaces or line breaks no longer
    # create pseudo-words that would be scored 0 and distort the average.
    words = sentence.split()

    # One pass over the table for the whole sentence instead of one full scan
    # per word. The strict '>' keeps the first-seen entry on ties, matching
    # what max() over the table order would return.
    wanted = set(words)
    best = {}
    for pair, prob in t.items():
        source_word = pair[source_index]
        if source_word in wanted and (source_word not in best or prob > best[source_word][1]):
            best[source_word] = (pair[1 - source_index], prob)

    translated_sentence = []
    translation_probabilities = []
    for word in words:
        # No translation found -> keep the word, probability is 0.
        target_word, max_prob = best.get(word, (None, 0))
        translated_sentence.append(target_word if target_word else word)
        translation_probabilities.append(max_prob)

    average_probability = sum(translation_probabilities) / len(translation_probabilities) if translation_probabilities else 0.0
    return ' '.join(translated_sentence), translation_probabilities, average_probability

def update_target_language():
    """Return the ``(source_lang, target_lang)`` pair for the current selection.

    Reads the global ``lang_var``: "fi" selects fi -> hi; any other value
    selects hi -> fi.
    """
    if lang_var.get() != "fi":
        return "hi", "fi"
    return "fi", "hi"

def translate():
    """Translate-button callback: validate the input text and display the result.

    Shows a warning dialog and stops when the text is empty or contains any
    character from ``string.punctuation``; otherwise writes the translation
    and its average probability into the result widgets.
    """
    # "end-1c" drops the trailing newline the Text widget appends on its own.
    raw_text = entry_sentence.get("1.0", "end-1c")
    sentence = raw_text.strip().lower()

    if sentence == "":
        messagebox.showwarning("Warning", "Please enter a sentence to translate.")
        return

    if set(string.punctuation).intersection(sentence):
        messagebox.showwarning("Warning", "The sentence must not contain punctuation.")
        return

    source_lang, target_lang = update_target_language()
    translated, _, avg_probability = translate_sentence_with_probabilities(sentence, source_lang, target_lang)

    result_label.config(text=translated)
    avg_prob_label.config(text=f"Average Probability: {avg_probability:.2f}")


def clear_text():
    """Clear-button callback: empty the input box and reset both output labels."""
    # Text widget indices run from "1.0" (line 1, column 0) to tk.END.
    entry_sentence.delete("1.0", tk.END)
    resets = (
        (result_label, ""),
        (avg_prob_label, "Average Probability:"),
    )
    for label, blank_text in resets:
        label.config(text=blank_text)

# Build the translator GUI: a fixed-size window with three side-by-side
# frames (input text | controls | translated output), then start the Tk
# event loop. Widgets are pack()ed in creation order, so the statement
# order below determines the layout.

# Create the main window
root = tk.Tk()
root.title("Statistical Machine Translation")
root.resizable(False, False)
root.geometry("1050x300")

# Shared font for the input box and the result label.
fontobj = tkFont.Font(size=15)

# Three columns, each packed against the left edge in turn.
left_frame = tk.Frame(root)
left_frame.pack(side=tk.LEFT, padx=10, pady=10, fill=tk.Y)

middle_frame = tk.Frame(root)
middle_frame.pack(side=tk.LEFT, padx=10, pady=10)

right_frame = tk.Frame(root)
right_frame.pack(side=tk.LEFT, padx=10, pady=10, fill=tk.Y)

# Left column: multi-line input box read by translate().
tk.Label(left_frame, text="Enter your sentence:").pack()
entry_sentence = tk.Text(left_frame, width=40, font=fontobj)
entry_sentence.pack()

# Middle column: source-language selector. lang_var holds "fi" or "hi" and is
# read by update_target_language(); the value that callback returns when a
# radio button fires it is not used here - translate() calls it again.
tk.Label(middle_frame, text="Select the source language:").pack()
lang_var = tk.StringVar(value="fi")
tk.Radiobutton(middle_frame, text="Filipino (fi)", variable=lang_var, value="fi", command=update_target_language).pack()
tk.Radiobutton(middle_frame, text="Hiligaynon (hi)", variable=lang_var, value="hi", command=update_target_language).pack()

# Created with no text; no visible code assigns text to these two labels.
source_lang_label = tk.Label(middle_frame)
source_lang_label.pack()
target_lang_label = tk.Label(middle_frame)
target_lang_label.pack()

# Action buttons wired to the translate() and clear_text() callbacks.
translate_button = tk.Button(middle_frame, text="Translate", command=translate)
translate_button.pack(pady=10)

clear_button = tk.Button(middle_frame, text="Clear", command=clear_text)
clear_button.pack(pady=(0,10))

# Right column: label that translate() fills with the translated sentence.
tk.Label(right_frame, text="Translated Sentence:").pack()
result_label = tk.Label(right_frame, text="", relief=tk.SUNKEN, width=45, anchor="nw",height=16, font=fontobj)
result_label.pack()

# Average-probability readout; lives in the middle column, below the buttons.
avg_prob_label = tk.Label(middle_frame, text="Average Probability:", relief=tk.SUNKEN, width=18, anchor="w", borderwidth=0)
avg_prob_label.pack()

# Blocks this cell until the window is closed.
root.mainloop()
