# Statistical Machine Translation 

In [1]:
from module.il_tl.rule_based_il import dict_il, remove_punct, tokenize, tag
from module.tl_il.rule_based_tl import dict_tl
from module.tl_il.doc_trans_tl import combine_tokens
import pandas as pd

## Initialization of the Datasets


In [2]:
dict_il_tl_lm = pd.read_json('src/json data/Ilokano to Tagalog/Example-Based/Language Model/dict_il_tl_lang_mod.json')

dict_il_tl_lm.head()

Unnamed: 0,Ilokano Structure,Tagalog Structure,Tagalog Structure Count
0,[SW],[[SW]],[10]
1,"[DT, VB, DT, NN]","[[VB, DT, NN], [DT, VB, DT, NN], [DT, NN, DT, ...","[97, 192, 37, 39, 46, 27]"
2,"[DT, NN]","[[DT, NN]]",[6140]
3,"[DT, NN, VB]","[[DT, VB, DT, NN], [VB, DT, NN], [DT, NN, VB],...","[47, 344, 443, 158, 124, 47]"
4,[CC],[[CC]],[3590]


### Putting the Structure Columns in a list

In [3]:
il_struct = dict_il_tl_lm['Ilokano Structure'].tolist()
tl_struct = dict_il_tl_lm['Tagalog Structure'].tolist()
tl_struct_count = dict_il_tl_lm['Tagalog Structure Count'].tolist()

# print(tl_struct_count[0].index(max(tl_struct_count[0])))

## Putting the SMT columns in a list

In [4]:
"""
    TF-IDF
"""
vb_il_tf_idf_list = dict_il.dict_vb['Ilokano Verb TF-IDF'].tolist()
nn_il_tf_idf_list = dict_il.dict_nn['Ilokano Noun TF-IDF'].tolist()
jj_il_tf_idf_list = dict_il.dict_jj['Ilokano Adjective TF-IDF'].tolist()
rb_il_tf_idf_list = dict_il.dict_rb['Ilokano Adverb TF-IDF'].tolist()
cc_il_tf_idf_list = dict_il.dict_cc['Ilokano Conjunction TF-IDF'].tolist()
pr_il_tf_idf_list = dict_il.dict_pr['Ilokano Preposition TF-IDF'].tolist()
dt_il_tf_idf_list = dict_il.dict_dt['Ilokano Determiner TF-IDF'].tolist()

"""
    Count Vectors
"""
vb_tl_tf_cnt_list = dict_il.dict_vb['Tagalog Verb Count'].tolist()
nn_tl_tf_cnt_list = dict_il.dict_nn['Tagalog Noun Count'].tolist()
jj_tl_tf_cnt_list = dict_il.dict_jj['Tagalog Adjective Count'].tolist()
rb_tl_tf_cnt_list = dict_il.dict_rb['Tagalog Adverb Count'].tolist()
cc_tl_tf_cnt_list = dict_il.dict_cc['Tagalog Conjunction Count'].tolist()
pr_tl_tf_cnt_list = dict_il.dict_pr['Tagalog Preposition Count'].tolist()
dt_tl_tf_cnt_list = dict_il.dict_dt['Tagalog Determiner Count'].tolist()

## Hierarchial Dependence Model

### Adding the SMT values

In [5]:
def get_sum_il(sen_poss_list, dict_source, not_in_sw, not_in_vb, not_in_nn, not_in_jj, not_in_rb, not_in_cc, not_in_pr, not_in_dt, not_tagged, sum_tf_idf_il_list):
    sp_index = 0 # sentence POS index
    
    for sen_poss in sen_poss_list:
        # loop for getting the pos structure of every sentence
        """
        sen_poss is a list of POS of a sentence
        eg. ['VB', 'DT', 'NN', 'DT', 'NN']
        """
        sen_translation = []
        
        sum_tf_idf_il = 0
        wp_index = 0 # word POS index
        
        for word_pos in sen_poss:
            word = dict_source['Tokenized'][sp_index][wp_index]
            # gets the word in every sentence
            
            # Matching Conditions    
            # 1. SW
            if word_pos == 'SW':
                """
                if the POS of the word is 'SW'
                """
                if word in dict_il.sw_il_list:
                    """
                    if the word is in the Tagalog list of single words
                    """
                    temp_index = dict_il.sw_il_list.index(word)
                    
                else:
                    not_in_sw.append(word) # for debugging purposes
                                
            # 2. SW
            elif word_pos == 'VB':
                """
                if the POS of the word is 'VB'
                """
                if word in dict_il.vb_il_list:
                    """
                    if the word is in the Tagalog list of verbs
                    """
                    temp_index = dict_il.vb_il_list.index(word)
                    sum_tf_idf_il += vb_il_tf_idf_list[temp_index]
                else:
                    not_in_vb.append(word) # for debugging purposes
            
            # 3. NN
            elif word_pos == 'NN':
                """
                if the POS of the word is 'NN'
                """
                if word in dict_il.nn_il_list:
                    """
                    if the word is in the Tagalog list of nouns
                    """
                    temp_index = dict_il.nn_il_list.index(word)
                    sum_tf_idf_il += nn_il_tf_idf_list[temp_index]
                else:
                    not_in_nn.append(word) # for debugging purposes
            
            # 4. JJ
            elif word_pos == 'JJ':
                """
                if the POS of the word is 'JJ'
                """
                if word in dict_il.jj_il_list:
                    """
                    if the word is in the Tagalog list of nouns
                    """
                    temp_index = dict_il.jj_il_list.index(word)
                    sum_tf_idf_il += jj_il_tf_idf_list[temp_index]
                else:
                    not_in_jj.append(word) # for debugging purposes
            
            # 5. RB
            elif word_pos == 'RB':
                """
                if the POS of the word is 'RB'
                """
                if word in dict_il.rb_il_list:
                    """
                    if the word is in the Tagalog list of nouns
                    """
                    temp_index = dict_il.rb_il_list.index(word)
                    sum_tf_idf_il += rb_il_tf_idf_list[temp_index]
                else:
                    not_in_rb.append(word) # for debugging purposes
                    
            # 6. CC
            elif word_pos == 'CC':
                """
                if the POS of the word is 'CC'
                """
                if word in dict_il.cc_il_list:
                    """
                    if the word is in the Tagalog list of nouns
                    """
                    temp_index = dict_il.cc_il_list.index(word)
                    sum_tf_idf_il += cc_il_tf_idf_list[temp_index]
                else:
                    not_in_cc.append(word) # for debugging purposes
                    
            # 7. PR
            elif word_pos == 'PR':
                """
                if the POS of the word is 'CC'
                """
                if word in dict_il.pr_il_list:
                    """
                    if the word is in the Tagalog list of nouns
                    """
                    temp_index = dict_il.pr_il_list.index(word)
                    sum_tf_idf_il += pr_il_tf_idf_list[temp_index]
                else:
                    not_in_pr.append(word) # for debugging purposes
            
             # 8. DT
            elif word_pos == 'DT':
                """
                if the POS of the word is 'DT'
                """
                if word in dict_il.dt_il_list:
                    """
                    if the word is in the Tagalog list of nouns
                    """
                    temp_index = dict_il.dt_il_list.index(word)
                    sum_tf_idf_il += dt_il_tf_idf_list[temp_index]
                else:
                    not_in_dt.append(word) # for debugging purposes
            
            else:
                not_tagged.append(word) # for debugging purposes
                
            wp_index += 1
        
        sum_tf_idf_il_list.append(round(sum_tf_idf_il, 5))
        sp_index += 1
        
    return sum_tf_idf_il_list

## Translating with SMT translation model

### Transform

In [6]:
def trans_lm(ngram_data):
    trans_ngram_data = []
    for ngram_sen in ngram_data:
        trans_ngram_sen = []
        
        for ngram in ngram_sen:
            if ngram in il_struct:
                temp_index = il_struct.index(ngram)
                max_count = max(tl_struct_count[temp_index])
                trans_index = tl_struct_count[temp_index].index(max_count)
                trans_ngram = tl_struct[temp_index][trans_index]
                trans_ngram_sen.append(trans_ngram)
            else:
                trans_ngram_sen.append(ngram)
                
            # np_index += 1
        
        trans_ngram_data.append(trans_ngram_sen)
        
    return trans_ngram_data
# end of function

In [7]:
il_phrases = [remove_punct(word) for word in dict_il.il_phrases]
il_phrases = [tokenize(word) for word in il_phrases]

tl_phrases = [remove_punct(word) for word in dict_tl.tl_phrases]
tl_phrases = [tokenize(word) for word in tl_phrases]


In [8]:
def inFPhrases(word, word2, word3, word4, word5, word6, word7, il_phrases):
    inFPhrases = False
    il_phrase = []
    w_used = 0
    for phrase in il_phrases:
        length = len(phrase)
        if length == 7:
            if word == phrase[0] and word2 == phrase[1] and word3 == phrase[2] and word4 == phrase[3] and word5 == phrase[4] and word6 == phrase[5] and word7 == phrase[6]:
                inFPhrases = True
                tl_phrase = phrase
                w_used = 7
                break        
        if length == 6:
            if word == phrase[0] and word2 == phrase[1] and word3 == phrase[2] and word4 == phrase[3] and word5 == phrase[4] and word6 == phrase[5]:
                inFPhrases = True
                tl_phrase = phrase
                w_used = 6
                break
        if length == 5:
            if word == phrase[0] and word2 == phrase[1] and word3 == phrase[2] and word4 == phrase[3] and word5 == phrase[4]:
                inFPhrases = True
                tl_phrase = phrase
                w_used = 5
                break
        if length == 4:
            if word == phrase[0] and word2 == phrase[1] and word3 == phrase[2] and word4 == phrase[3]:
                inFPhrases = True
                tl_phrase = phrase
                w_used = 4
                break
        if length == 3:
            if word == phrase[0] and word2 == phrase[1] and word3 == phrase[2]:
                inFPhrases = True
                il_phrase = phrase
                w_used = 3
                break
        if length == 2:
            if word == phrase[0] and word2 == phrase[1]:
                inFPhrases = True
                il_phrase = phrase
                w_used = 2
                break
        if length == 1:
            if word == phrase[0]:
                inFPhrases = True
                il_phrase = phrase
                w_used = 1
                break
                
    return inFPhrases, il_phrase, w_used
# end of function

In [9]:
from module.smt import encapsulate, ngram_var

def translate_smt(sen_poss_list, dict_source):
    not_in_sw = []
    not_in_vb = []
    not_in_nn = []
    not_in_jj = []
    not_in_rb = []
    not_in_cc = []
    not_in_pr = []
    not_in_dt = []
    not_tagged = []
    sum_tf_idf_il_list = []

    sum_tf_idf_il_list = get_sum_il(sen_poss_list, dict_source, not_in_sw, not_in_vb, not_in_nn, not_in_jj, not_in_rb, not_in_cc, not_in_pr, not_in_dt, not_tagged, sum_tf_idf_il_list)
    
    encapsulate(sen_poss_list, ngram_var.fourgram_list, ngram_var.trigram_list, ngram_var.bigram_list, ngram_var.unigram_list, ngram_var.ngram_list, ngram_var.notencap_list, ngram_var.fourgram_count_sen, ngram_var.trigram_count_sen, ngram_var.bigram_count_sen, ngram_var.unigram_count_sen, ngram_var.notencap_count_sen)
    
    ngram_data = ngram_var.ngram_list
    
    trans_ngram_data = trans_lm(ngram_data)
    
    sp_index = 0 # sentence POS index
    sen_translation_list = []
    
    for sen_poss in sen_poss_list:
        # loop for getting the pos structure of every sentence
        """
        sen_poss is a list of POS of a sentence
        eg. ['VB', 'DT', 'NN', 'DT', 'NN']
        """
        sen_translation = []
        wp_index = 0
        cur_wp_index = 0
        
        for word_pos in sen_poss:
            if wp_index == cur_wp_index:
                word = dict_source['Tokenized'][sp_index][wp_index]
                # gets the word in every sentence
                
                try: 
                    word2 = dict_source['Tokenized'][sp_index][wp_index+1]
                except:
                    word2 = None
                try:
                    word3 = dict_source['Tokenized'][sp_index][wp_index+2]
                except:
                    word3 = None
                try:
                    word4 = dict_source['Tokenized'][sp_index][wp_index+3]
                except:
                    word4 = None
                try:
                    word5 = dict_source['Tokenized'][sp_index][wp_index+4]
                except:
                    word5 = None
                try:
                    word6 = dict_source['Tokenized'][sp_index][wp_index+5]
                except:
                    word6 = None
                try:
                    word7 = dict_source['Tokenized'][sp_index][wp_index+6]
                except:
                    word7 = None
                
                ans = inFPhrases(word, word2, word3, word4, word5, word6, word7, il_phrases)
                inFPDict = ans[0]
                il_phrase = ans[1]
                w_used = ans[2]                
                
                if inFPDict and il_phrase != []:
                    """
                    if the word is in the list of Tagalog phrases
                    """
                    p_index = il_phrases.index(il_phrase)
                    tl_phrase = tl_phrases[p_index]
                    for tl_word in tl_phrase:
                        sen_translation.append(tl_word)
                    cur_wp_index = wp_index + w_used
                    
                else:
                    cur_wp_index = wp_index + 1
                
                    # Matching Conditions    
                    # 1. SW
                    if word_pos == 'SW':
                        """
                        if the POS of the word is 'SW'
                        """
                        if word in dict_il.sw_il_list:
                            temp_index = dict_il.sw_il_list.index(word)
                            if dict_il.sw_tl_list[temp_index][0] == 'None':
                                sen_translation.append(word)
                            else:
                                sen_translation.append(dict_il.sw_tl_list[temp_index][0])
                        else:
                            sen_translation.append(word)
                    
                    # 2. VB
                    elif word_pos == 'VB':
                        """
                        if the POS of the word is 'VB'
                        """
                        if word in dict_il.vb_il_list:
                            """
                            if the word is in the Tagalog list of verbs
                            """
                            il_index = dict_il.vb_il_list.index(word)
                            max_ilidf = max(dict_il.vb_tfidf_tl_list[il_index])
                            tl_index = dict_il.vb_tfidf_tl_list[il_index].index(max_ilidf)
                            tl_word = dict_il.vb_tl_list[il_index][tl_index]
                            
                            if tl_word == 'None':
                                sen_translation.append(word)
                            else:
                                sen_translation.append(tl_word)
                        else:
                            sen_translation.append(word)
                    
                    # 3. NN
                    elif word_pos == 'NN':
                        """
                        if the POS of the word is 'NN'
                        """
                        if word in dict_il.nn_il_list:
                            """
                            if the word is in the Tagalog list of noun
                            """
                            il_index = dict_il.nn_il_list.index(word)
                            max_ilidf = max(dict_il.nn_tfidf_tl_list[il_index])
                            tl_index = dict_il.nn_tfidf_tl_list[il_index].index(max_ilidf)
                            tl_word = dict_il.nn_tl_list[il_index][tl_index]
                            
                            if tl_word == 'None':
                                sen_translation.append(word)
                            else:
                                sen_translation.append(tl_word)
                        else:
                            sen_translation.append(word)
                    
                    # 4. JJ
                    elif word_pos == 'JJ':
                        """
                        if the POS of the word is 'JJ'
                        """
                        if word in dict_il.jj_il_list:
                            """
                            if the word is in the Tagalog list of adjectives
                            """
                            il_index = dict_il.jj_il_list.index(word)
                            max_ilidf = max(dict_il.jj_tfidf_tl_list[il_index])
                            tl_index = dict_il.jj_tfidf_tl_list[il_index].index(max_ilidf)
                            tl_word = dict_il.jj_tl_list[il_index][tl_index]
                            
                            if tl_word == 'None':
                                sen_translation.append(word)
                            else:
                                sen_translation.append(tl_word)
                        else:
                            sen_translation.append(word)
                            
                    # 5. RB
                    elif word_pos == 'RB':
                        """
                        if the POS of the word is 'RB'
                        """
                        if word in dict_il.rb_il_list:
                            """
                            if the word is in the Tagalog list of adverbs
                            """
                            il_index = dict_il.rb_il_list.index(word)
                            max_ilidf = max(dict_il.rb_tfidf_tl_list[il_index])
                            tl_index = dict_il.rb_tfidf_tl_list[il_index].index(max_ilidf)
                            tl_word = dict_il.rb_tl_list[il_index][tl_index]
                            
                            if tl_word == 'None':
                                sen_translation.append(word)
                            else:
                                sen_translation.append(tl_word)
                        else:
                            sen_translation.append(word)
                    
                    # 6. CC
                    elif word_pos == 'CC':
                        """
                        if the POS of the word is 'CC'
                        """
                        if word in dict_il.cc_il_list:
                            """
                            if the word is in the Tagalog list of conjunctions
                            """
                            il_index = dict_il.cc_il_list.index(word)
                            max_ilidf = max(dict_il.cc_tfidf_tl_list[il_index])
                            tl_index = dict_il.cc_tfidf_tl_list[il_index].index(max_ilidf)
                            tl_word = dict_il.cc_tl_list[il_index][tl_index]
                            
                            if tl_word == 'None':
                                sen_translation.append(word)
                            else:
                                sen_translation.append(tl_word)
                        else:
                            sen_translation.append(word)
                            
                    # 7. PR
                    elif word_pos == 'PR':
                        """
                        if the POS of the word is 'PR'
                        """
                        if word in dict_il.pr_il_list:
                            """
                            if the word is in the Tagalog list of prepositions
                            """
                            il_index = dict_il.pr_il_list.index(word)
                            max_ilidf = max(dict_il.pr_tfidf_tl_list[il_index])
                            tl_index = dict_il.pr_tfidf_tl_list[il_index].index(max_ilidf)
                            tl_word = dict_il.pr_tl_list[il_index][tl_index]
                            
                            if tl_word == 'None':
                                sen_translation.append(word)
                            else:
                                sen_translation.append(tl_word)
                        else:
                            sen_translation.append(word)
                            
                    # 8. DT
                    elif word_pos == 'DT':
                        """
                        if the POS of the word is 'DT'
                        """
                        if word in dict_il.dt_il_list:
                            """
                            if the word is in the Tagalog list of determiners
                            """
                            il_index = dict_il.dt_il_list.index(word)
                            max_ilidf = max(dict_il.dt_tfidf_tl_list[il_index])
                            tl_index = dict_il.dt_tfidf_tl_list[il_index].index(max_ilidf)
                            tl_word = dict_il.dt_tl_list[il_index][tl_index]
                            
                            if tl_word == 'None':
                                sen_translation.append(word)
                            else:
                                sen_translation.append(tl_word)
                        else:
                            sen_translation.append(word)
                    
                    else:
                        sen_translation.append(word)

            wp_index += 1
        sp_index += 1
        sen_translation_list.append(sen_translation)
    
    return sen_translation_list
# end of function

## Opening and processing the Source Document

In [10]:
def translate_smt_nb(test_doc, target_op):
    parsed_source = test_doc.split("\n")

    cleaned_source = [remove_punct(word) for word in parsed_source]
    toklenized_source = [tokenize(word) for word in cleaned_source]
    dict_source = pd.DataFrame({'Tokenized': toklenized_source})
    pos_sen_list = tag(dict_source['Tokenized'])

    dict_source['POS'] = pos_sen_list
    # sen_translation_list = translate_smt(dict_source['POS'], dict_source)
    sen_translation_list = translate_smt(dict_source['POS'], dict_source)
    temp_sen_list = combine_tokens(sen_translation_list)

    # Dictionary of the system output and the expected output and their scores
    dict_op_ex = pd.DataFrame({'Source Text': cleaned_source})

    parsed_expected_op = target_op.split("\n")
    cleaned_expected_op = [remove_punct(word) for word in parsed_expected_op]
    toklenized_expected_op = [tokenize(word) for word in cleaned_expected_op]
    combine_tokens_expected_op = combine_tokens(toklenized_expected_op)
    dict_op_ex['Target Output'] = combine_tokens_expected_op
    dict_op_ex['System Output'] = temp_sen_list

    dict_op_ex.head()
    return dict_op_ex
# end of function

## Translating Training data

In [11]:
# Opening the file
# test_doc = open("src/text data/Bible_Ilokano.txt", encoding='utf-8').read()
# target_op = open("src/text data/Bible_Tagalog.txt", encoding='utf-8').read()

# dict_train = translate_smt_nb(test_doc, target_op)

## Translation Testing data

In [12]:
# Opening the file
test_doc = open("src/text data/testing data/Ilokano/il_test_data_bible.txt", encoding='utf-8').read()
target_op = open("src/text data/testing data/Tagalog/tl_test_data_bible.txt", encoding='utf-8').read()

dict_test = translate_smt_nb(test_doc, target_op)

In [13]:
import json

# dict_op_ex_rec = dict_train.to_dict('records')

# try:
#     with open("src/json data/Ilokano to Tagalog/Hybrid Translator/dict_il-tl_op_ex.json", "w") as outfile:
#         json.dump(dict_op_ex_rec, outfile)
#     print("Successfully saved the file.")
# except: 
#     print("Error in saving the file.")

## Saving Testing result


In [14]:
dict_test_rec = dict_test.to_dict('records')

try:
    with open("src/json data/Ilokano to Tagalog/Hybrid Translator/dict_il-tl_test.json", "w") as outfile:
        json.dump(dict_test_rec, outfile)
    print("Successfully saved the file.")
except:
    print("Error in saving the file.")

Successfully saved the file.


In [15]:
from module.scoring import scoring_bleu, scoring_ter

ave_bleu = scoring_bleu(dict_op_ex)
ave_ter = scoring_ter(dict_op_ex)

NameError: name 'dict_op_ex' is not defined

In [None]:
print("Average TER Score: ", ave_ter)