# Statistical Machine Translation 

In [1]:
from module.tl_il.rule_based_tl import dict_tl, remove_punct, tokenize, tag
from module.tl_il.doc_trans_tl import combine_tokens
import pandas as pd

## Initialization of the Datasets

In [2]:
# Load the Tagalog -> Ilokano language model. Each row pairs one Tagalog POS
# structure with its candidate Ilokano structures and a count per candidate
# (columns: 'Tagalog Structure', 'Ilokano Structure', 'Ilokano Structure Count').
dict_tl_il_lm = pd.read_json('src/json data/Tagalog to Ilokano/Example-Based/Language Model/dict_tl_il_lang_mod.json')

# Preview the first rows as the cell output.
dict_tl_il_lm.head()

Unnamed: 0,Tagalog Structure,Ilokano Structure,Ilokano Structure Count
0,[SW],[[SW]],[8]
1,"[VB, DT, NN]","[[DT, VB, DT, NN], [DT, NN, VB], [VB, DT, NN],...","[99, 358, 857, 40, 72, 61]"
2,"[DT, NN]","[[DT, NN]]",[5041]
3,"[DT, VB, DT, NN]","[[DT, NN, VB], [DT, VB, DT, NN], [VB, DT, NN],...","[62, 177, 68, 15, 22, 18]"
4,[CC],"[[CC], [CC, RB]]","[3704, 3]"


### Putting the Structure Columns in a list

In [3]:
# Pull the three language-model columns out as plain Python lists. The lists
# stay row-aligned: the same index addresses one Tagalog structure, its
# candidate Ilokano structures, and the count of each candidate.
tl_struct, il_struct, il_struct_count = (
    dict_tl_il_lm[column].tolist()
    for column in ('Tagalog Structure', 'Ilokano Structure', 'Ilokano Structure Count')
)

## Putting the SMT columns in a list

In [4]:
"""
    TF-IDF
"""
# One list per part of speech, read from the 'Tagalog <POS> TF-IDF' column of
# the matching rule-based dictionary frame.
vb_tl_tf_idf_list = dict_tl.dict_vb['Tagalog Verb TF-IDF'].tolist()          # verbs
nn_tl_tf_idf_list = dict_tl.dict_nn['Tagalog Noun TF-IDF'].tolist()          # nouns
jj_tl_tf_idf_list = dict_tl.dict_jj['Tagalog Adjective TF-IDF'].tolist()     # adjectives
rb_tl_tf_idf_list = dict_tl.dict_rb['Tagalog Adverb TF-IDF'].tolist()        # adverbs
cc_tl_tf_idf_list = dict_tl.dict_cc['Tagalog Conjunction TF-IDF'].tolist()   # conjunctions
pr_tl_tf_idf_list = dict_tl.dict_pr['Tagalog Preposition TF-IDF'].tolist()   # prepositions
dt_tl_tf_idf_list = dict_tl.dict_dt['Tagalog Determiner TF-IDF'].tolist()    # determiners

# Count Vectors
# Same layout as above, read from the 'Ilokano <POS> Count' columns.
vb_il_tf_cnt_list = dict_tl.dict_vb['Ilokano Verb Count'].tolist()           # verbs
nn_il_tf_cnt_list = dict_tl.dict_nn['Ilokano Noun Count'].tolist()           # nouns
jj_il_tf_cnt_list = dict_tl.dict_jj['Ilokano Adjective Count'].tolist()      # adjectives
rb_il_tf_cnt_list = dict_tl.dict_rb['Ilokano Adverb Count'].tolist()         # adverbs
cc_il_tf_cnt_list = dict_tl.dict_cc['Ilokano Conjunction Count'].tolist()    # conjunctions
pr_il_tf_cnt_list = dict_tl.dict_pr['Ilokano Preposition Count'].tolist()    # prepositions
dt_il_tf_cnt_list = dict_tl.dict_dt['Ilokano Determiner Count'].tolist()     # determiners

## Hierarchical Dependence Model

### Adding the SMT values

In [5]:
def _first_index_or_none(sequence, item):
    """Return the position of the first element equal to ``item``, or None if absent."""
    try:
        return sequence.index(item)
    except ValueError:
        return None


def get_sum_tl(sen_poss_list, dict_source, not_in_sw, not_in_vb, not_in_nn, not_in_jj, not_in_rb, not_in_cc, not_in_pr, not_in_dt, not_tagged, sum_tf_idf_tl_list):
    """Sum the Tagalog TF-IDF scores of the words of every sentence.

    Parameters
    ----------
    sen_poss_list : iterable of sequences of str
        One POS sequence per sentence, e.g. ``['VB', 'DT', 'NN', 'DT', 'NN']``.
    dict_source : mapping or DataFrame
        ``dict_source['Tokenized'][s][w]`` must give word ``w`` of sentence ``s``,
        using the same positions as ``sen_poss_list``.
    not_in_sw, not_in_vb, not_in_nn, not_in_jj, not_in_rb, not_in_cc, not_in_pr, not_in_dt : list
        Debugging logs. A word tagged with that POS but missing from the matching
        Tagalog vocabulary is appended to the corresponding list.
    not_tagged : list
        Debugging log for words whose tag is none of the eight handled tags.
    sum_tf_idf_tl_list : list
        Receives one sum per sentence (rounded to 5 decimals); mutated in place.

    Returns
    -------
    list
        The same ``sum_tf_idf_tl_list`` object that was passed in.
    """
    # POS tag -> (Tagalog vocabulary, TF-IDF score per vocabulary entry, miss log).
    # 'SW' has no TF-IDF list: a known single word contributes nothing to the
    # sum, while an unknown one is still logged.
    lookups = {
        'SW': (dict_tl.sw_tl_list, None, not_in_sw),
        'VB': (dict_tl.vb_tl_list, vb_tl_tf_idf_list, not_in_vb),
        'NN': (dict_tl.nn_tl_list, nn_tl_tf_idf_list, not_in_nn),
        'JJ': (dict_tl.jj_tl_list, jj_tl_tf_idf_list, not_in_jj),
        'RB': (dict_tl.rb_tl_list, rb_tl_tf_idf_list, not_in_rb),
        'CC': (dict_tl.cc_tl_list, cc_tl_tf_idf_list, not_in_cc),
        'PR': (dict_tl.pr_tl_list, pr_tl_tf_idf_list, not_in_pr),
        'DT': (dict_tl.dt_tl_list, dt_tl_tf_idf_list, not_in_dt),
    }

    for sp_index, sen_poss in enumerate(sen_poss_list):
        sentence_tokens = dict_source['Tokenized'][sp_index]
        sum_tf_idf_tl = 0

        for wp_index, word_pos in enumerate(sen_poss):
            word = sentence_tokens[wp_index]

            entry = lookups.get(word_pos)
            if entry is None:
                not_tagged.append(word)  # for debugging purposes
                continue

            vocabulary, scores, misses = entry
            # A single scan both tests membership and yields the position.
            position = _first_index_or_none(vocabulary, word)
            if position is None:
                misses.append(word)  # for debugging purposes
            elif scores is not None:
                sum_tf_idf_tl += scores[position]

        sum_tf_idf_tl_list.append(round(sum_tf_idf_tl, 5))

    return sum_tf_idf_tl_list

## Translating with SMT translation model

### Transform

In [6]:
def trans_lm(ngram_data):
    """Map every Tagalog POS n-gram to its most frequent Ilokano structure.

    ``ngram_data`` holds one list of n-grams per sentence. An n-gram found in
    ``tl_struct`` is replaced by the entry of ``il_struct`` with the highest
    value in ``il_struct_count`` (the first one on ties); any other n-gram is
    passed through unchanged. Returns a new list with the same nesting.
    """
    def _most_frequent_target(pattern):
        if pattern not in tl_struct:
            return pattern
        row = tl_struct.index(pattern)
        counts = il_struct_count[row]
        return il_struct[row][counts.index(max(counts))]

    return [
        [_most_frequent_target(pattern) for pattern in sentence_ngrams]
        for sentence_ngrams in ngram_data
    ]
# end of function

In [7]:
# Load the fixed-phrase table and normalise both language columns the same
# way: strip punctuation from each phrase, then split it into tokens. Row i of
# tl_phrases and row i of il_phrases come from the same CSV row.
f_phrases = pd.read_csv('src/csv data/f_phrases.csv')
il_phrases = [tokenize(remove_punct(phrase)) for phrase in f_phrases['Ilokano'].to_list()]
tl_phrases = [tokenize(remove_punct(phrase)) for phrase in f_phrases['Tagalog'].to_list()]

In [8]:
def inFPhrases(word, word2, word3, word4, word5, word6, word7, tl_phrases):
    """Find the first fixed phrase that starts at ``word``.

    ``word`` .. ``word7`` are the current token and up to six tokens that
    follow it. ``tl_phrases`` is scanned in order; a phrase of 1 to 7 tokens
    matches when each of its tokens equals the token at the same position in
    that window. Phrases of any other length are ignored.

    Returns a tuple ``(found, phrase, tokens_used)``: ``(True, phrase,
    len(phrase))`` for the first match, otherwise ``(False, [], 0)``.

    NOTE(review): the first match in list order wins, not the longest one -
    confirm the phrase table is ordered accordingly.
    """
    window = (word, word2, word3, word4, word5, word6, word7)

    for candidate in tl_phrases:
        size = len(candidate)
        if not 1 <= size <= len(window):
            continue
        if all(window[pos] == candidate[pos] for pos in range(size)):
            return True, candidate, size

    return False, [], 0
# end of function

In [9]:
from module.smt import encapsulate, ngram_var

def _token_or_none(tokens, position):
    """Return ``tokens[position]``, or None when ``position`` is past the end."""
    return tokens[position] if position < len(tokens) else None


def _best_ilokano_candidate(word, tl_words, il_scores, il_words):
    """Translate ``word`` with the highest-scoring Ilokano candidate.

    ``tl_words[i]`` is a Tagalog word, ``il_words[i]`` its Ilokano candidates
    and ``il_scores[i]`` the score of each candidate. The source word is
    returned unchanged when it is unknown or when the chosen candidate is the
    placeholder string 'None'.
    """
    try:
        tl_index = tl_words.index(word)
    except ValueError:
        return word
    scores = il_scores[tl_index]
    il_word = il_words[tl_index][scores.index(max(scores))]
    return word if il_word == 'None' else il_word


def _single_word_translation(word):
    """Translate an 'SW' word with its first listed Ilokano entry, if any."""
    try:
        sw_index = dict_tl.sw_tl_list.index(word)
    except ValueError:
        return word
    il_word = dict_tl.sw_il_list[sw_index][0]
    return word if il_word == 'None' else il_word


def translate_smt(sen_poss_list, dict_source):
    """Translate tokenized Tagalog sentences into lists of Ilokano tokens.

    Parameters
    ----------
    sen_poss_list : iterable of sequences of str
        One POS sequence per sentence, e.g. ``['VB', 'DT', 'NN', 'DT', 'NN']``.
    dict_source : mapping or DataFrame
        ``dict_source['Tokenized'][s][w]`` must give word ``w`` of sentence ``s``,
        using the same positions as ``sen_poss_list``.

    Returns
    -------
    list of list
        One list of output tokens per sentence.

    At each position a fixed phrase (see ``inFPhrases``) takes priority: its
    Ilokano tokens are emitted and the source words it covers are skipped.
    Otherwise the word is translated through the dictionary of its POS tag.
    Unknown words, unhandled tags and 'None' placeholders fall back to the
    source word.
    """
    # NOTE(review): the three calls below are kept from the original pipeline,
    # but their results are not used by the translation loop. `encapsulate`
    # receives the module-level ngram_var containers; what it does with them is
    # not visible here, so the calls are left in place.
    debug_logs = [[] for _ in range(9)]  # not_in_sw .. not_in_dt, not_tagged
    get_sum_tl(sen_poss_list, dict_source, *debug_logs, [])

    encapsulate(sen_poss_list, ngram_var.fourgram_list, ngram_var.trigram_list, ngram_var.bigram_list, ngram_var.unigram_list, ngram_var.ngram_list, ngram_var.notencap_list, ngram_var.fourgram_count_sen, ngram_var.trigram_count_sen, ngram_var.bigram_count_sen, ngram_var.unigram_count_sen, ngram_var.notencap_count_sen)

    trans_lm(ngram_var.ngram_list)

    # POS tag -> (Tagalog words, scores per Ilokano candidate, Ilokano candidates)
    lexicons = {
        'VB': (dict_tl.vb_tl_list, dict_tl.vb_tfidf_il_list, dict_tl.vb_il_list),
        'NN': (dict_tl.nn_tl_list, dict_tl.nn_tfidf_il_list, dict_tl.nn_il_list),
        'JJ': (dict_tl.jj_tl_list, dict_tl.jj_tfidf_il_list, dict_tl.jj_il_list),
        'RB': (dict_tl.rb_tl_list, dict_tl.rb_tfidf_il_list, dict_tl.rb_il_list),
        'CC': (dict_tl.cc_tl_list, dict_tl.cc_tfidf_il_list, dict_tl.cc_il_list),
        'PR': (dict_tl.pr_tl_list, dict_tl.pr_tfidf_il_list, dict_tl.pr_il_list),
        'DT': (dict_tl.dt_tl_list, dict_tl.dt_tfidf_il_list, dict_tl.dt_il_list),
    }

    tokenized = dict_source['Tokenized']
    sen_translation_list = []

    for sp_index, sen_poss in enumerate(sen_poss_list):
        tokens = tokenized[sp_index]
        sen_translation = []
        next_wp_index = 0  # first position not yet covered by a translation

        for wp_index, word_pos in enumerate(sen_poss):
            if wp_index != next_wp_index:
                # Already consumed by a phrase matched at an earlier position.
                continue

            word = tokens[wp_index]
            # Up to six following tokens; None marks "past the end of the sentence".
            lookahead = [_token_or_none(tokens, wp_index + offset) for offset in range(1, 7)]

            matched, tl_phrase, w_used = inFPhrases(word, *lookahead, tl_phrases)
            if matched:
                # tl_phrases and il_phrases are row-aligned.
                sen_translation.extend(il_phrases[tl_phrases.index(tl_phrase)])
                next_wp_index = wp_index + w_used
                continue

            next_wp_index = wp_index + 1

            if word_pos == 'SW':
                sen_translation.append(_single_word_translation(word))
            elif word_pos in lexicons:
                sen_translation.append(_best_ilokano_candidate(word, *lexicons[word_pos]))
            else:
                sen_translation.append(word)

        sen_translation_list.append(sen_translation)

    return sen_translation_list
# end of function

## Opening and processing the Source Document

In [10]:
def translate_smt_nb(test_doc, target_op):
    """Translate a document line by line and pair it with its reference.

    ``test_doc`` is the source text and ``target_op`` the expected translation;
    both are split on newlines, stripped of punctuation and tokenized. Returns
    a DataFrame with the columns 'Source Text' (cleaned source lines),
    'Target Output' (cleaned reference lines, re-joined) and 'System Output'
    (the translations produced by ``translate_smt``, re-joined).
    """
    # Source side: clean -> tokenize -> POS-tag -> translate.
    cleaned_source = [remove_punct(line) for line in test_doc.split("\n")]
    dict_source = pd.DataFrame({'Tokenized': [tokenize(line) for line in cleaned_source]})
    dict_source['POS'] = tag(dict_source['Tokenized'])
    system_output = combine_tokens(translate_smt(dict_source['POS'], dict_source))

    # Reference side: same cleaning so both outputs are comparable.
    cleaned_target = [remove_punct(line) for line in target_op.split("\n")]
    target_output = combine_tokens([tokenize(line) for line in cleaned_target])

    # Source, expected output and system output side by side.
    dict_op_ex = pd.DataFrame({'Source Text': cleaned_source})
    dict_op_ex['Target Output'] = target_output
    dict_op_ex['System Output'] = system_output

    return dict_op_ex
# end of function

## Translating Training data

In [11]:
# Opening the file
# test_doc = open("src/text data/Bible_Tagalog.txt", encoding='utf-8').read()
# target_op = open("src/text data/Bible_Ilokano.txt", encoding='utf-8').read()

# dict_train = translate_smt_nb(test_doc, target_op)

## Translating Testing data

In [12]:
# Read the Tagalog test sentences and their Ilokano reference translations.
# The context managers close each file as soon as it has been read.
with open("src/text data/testing data/Tagalog/tl_test_data_bible.txt", encoding='utf-8') as source_file:
    test_doc = source_file.read()
with open("src/text data/testing data/Ilokano/il_test_data_bible.txt", encoding='utf-8') as target_file:
    target_op = target_file.read()

# Translate the test document and pair it with the reference, line by line.
dict_test = translate_smt_nb(test_doc, target_op)

## Saving Training result

In [13]:
import json

# dict_op_ex_rec = dict_train.to_dict('records')

# try:
#     with open("src/json data/Tagalog to Ilokano/Hybrid Translator/dict_tl-il_op_ex.json", "w") as outfile:
#         json.dump(dict_op_ex_rec, outfile)
#     print("Successfully saved the file.")
# except:
#     print("Error in saving the file.")

## Saving Testing result


In [14]:
# One dict per row, so the result serialises as a JSON array of records.
dict_test_rec = dict_test.to_dict('records')

try:
    with open("src/json data/Tagalog to Ilokano/Hybrid Translator/dict_tl-il_test.json", "w") as outfile:
        json.dump(dict_test_rec, outfile)
    print("Successfully saved the file.")
except (OSError, TypeError, ValueError) as error:
    # OSError: the path cannot be opened or written.
    # TypeError / ValueError: a record holds a value json cannot serialise.
    # Anything else (e.g. KeyboardInterrupt) is deliberately not swallowed.
    print("Error in saving the file.", error)

Successfully saved the file.


In [15]:
from scoring import scoring_bleu, scoring_ter

# Score the test-set translations. `dict_op_ex` exists only inside
# translate_smt_nb; the frame it returned is bound to `dict_test`.
ave_bleu = scoring_bleu(dict_test)
ave_ter = scoring_ter(dict_test)

ModuleNotFoundError: No module named 'scoring'

In [None]:
# Report the average TER returned by scoring_ter in the scoring cell.
# NOTE(review): ave_bleu is computed in the same cell but is not printed here.
print("Average TER Score: ", ave_ter)