# Statistical Machine Translation 

In [None]:
from rule_based_tl import dict_tl, remove_punct, tokenize, tag
from doc_trans_tl import combine_tokens
import pandas as pd

## Putting the SMT columns in a list

In [None]:
# SMT score columns pulled out of the per-POS dictionary frames as plain lists.
# For every part of speech there are two lists:
#   *_tl_tf_idf_list -> the 'Tagalog <POS> TF-IDF' column
#   *_il_tf_cnt_list -> the 'Ilokano <POS> Count' column (count vectors)

# Verbs
vb_tl_tf_idf_list = dict_tl.dict_vb['Tagalog Verb TF-IDF'].tolist()
vb_il_tf_cnt_list = dict_tl.dict_vb['Ilokano Verb Count'].tolist()

# Nouns
nn_tl_tf_idf_list = dict_tl.dict_nn['Tagalog Noun TF-IDF'].tolist()
nn_il_tf_cnt_list = dict_tl.dict_nn['Ilokano Noun Count'].tolist()

# Adjectives
jj_tl_tf_idf_list = dict_tl.dict_jj['Tagalog Adjective TF-IDF'].tolist()
jj_il_tf_cnt_list = dict_tl.dict_jj['Ilokano Adjective Count'].tolist()

# Adverbs
rb_tl_tf_idf_list = dict_tl.dict_rb['Tagalog Adverb TF-IDF'].tolist()
rb_il_tf_cnt_list = dict_tl.dict_rb['Ilokano Adverb Count'].tolist()

# Conjunctions
cc_tl_tf_idf_list = dict_tl.dict_cc['Tagalog Conjunction TF-IDF'].tolist()
cc_il_tf_cnt_list = dict_tl.dict_cc['Ilokano Conjunction Count'].tolist()

# Prepositions
pr_tl_tf_idf_list = dict_tl.dict_pr['Tagalog Preposition TF-IDF'].tolist()
pr_il_tf_cnt_list = dict_tl.dict_pr['Ilokano Preposition Count'].tolist()

# Determiners
dt_tl_tf_idf_list = dict_tl.dict_dt['Tagalog Determiner TF-IDF'].tolist()
dt_il_tf_cnt_list = dict_tl.dict_dt['Ilokano Determiner Count'].tolist()

## Hierarchical Dependence Model

### Adding the SMT values

In [None]:
def get_sum_tl(sen_poss_list, dict_source, not_in_sw, not_in_vb, not_in_nn, not_in_jj, not_in_rb, not_in_cc, not_in_pr, not_in_dt, not_tagged, sum_tf_idf_tl_list):
    """Sum the Tagalog TF-IDF weights of the words of every sentence.

    Each word is looked up in the ``dict_tl`` Tagalog word list that matches
    its POS tag.  When the word is found, the weight stored at the same
    position in the matching module-level ``*_tl_tf_idf_list`` is added to the
    sentence total.  Words tagged 'SW' are only checked for membership; they
    never contribute to the total.

    Args:
        sen_poss_list: iterable of per-sentence POS tag lists,
            e.g. ``[['VB', 'DT', 'NN', 'DT', 'NN'], ...]``.
        dict_source: mapping whose ``'Tokenized'`` entry is indexed first by
            sentence position and then by word position to obtain the word.
        not_in_sw, not_in_vb, not_in_nn, not_in_jj, not_in_rb, not_in_cc,
        not_in_pr, not_in_dt: output lists (debugging aid).  Each receives
            the words carrying that tag that are missing from the
            corresponding Tagalog word list.
        not_tagged: output list receiving the words whose tag is none of the
            eight tags handled here.
        sum_tf_idf_tl_list: output list; one total per sentence, rounded to
            five decimals, is appended to it.

    Returns:
        The ``sum_tf_idf_tl_list`` object that was passed in.
    """
    # POS tag -> (Tagalog word list, TF-IDF weights or None, "not found" bucket).
    # 'SW' has no weight list: those words are only checked for membership.
    pos_table = {
        'SW': (dict_tl.sw_tl_list, None, not_in_sw),
        'VB': (dict_tl.vb_tl_list, vb_tl_tf_idf_list, not_in_vb),
        'NN': (dict_tl.nn_tl_list, nn_tl_tf_idf_list, not_in_nn),
        'JJ': (dict_tl.jj_tl_list, jj_tl_tf_idf_list, not_in_jj),
        'RB': (dict_tl.rb_tl_list, rb_tl_tf_idf_list, not_in_rb),
        'CC': (dict_tl.cc_tl_list, cc_tl_tf_idf_list, not_in_cc),
        'PR': (dict_tl.pr_tl_list, pr_tl_tf_idf_list, not_in_pr),
        'DT': (dict_tl.dt_tl_list, dt_tl_tf_idf_list, not_in_dt),
    }

    for sp_index, sen_poss in enumerate(sen_poss_list):
        # sen_poss is the list of POS tags of one sentence
        sum_tf_idf_tl = 0

        for wp_index, word_pos in enumerate(sen_poss):
            # the word sitting at the same position as the current tag
            word = dict_source['Tokenized'][sp_index][wp_index]

            entry = pos_table.get(word_pos)
            if entry is None:
                not_tagged.append(word)  # for debugging purposes
                continue

            tl_words, weights, missing = entry

            if weights is None:
                # membership check only, nothing to add to the total
                if word not in tl_words:
                    missing.append(word)  # for debugging purposes
                continue

            # A single scan of the word list instead of `in` followed by `.index`.
            try:
                position = tl_words.index(word)
            except ValueError:
                missing.append(word)  # for debugging purposes
            else:
                sum_tf_idf_tl += weights[position]

        sum_tf_idf_tl_list.append(round(sum_tf_idf_tl, 5))

    return sum_tf_idf_tl_list

## Translating with SMT translation model

In [None]:
def translate_smt(sen_poss_list, dict_source):
    """Translate tokenized sentences word by word using the ``dict_tl`` lists.

    For every word, the Tagalog word list matching its POS tag is searched.
    If the word is found, element ``[0]`` of the entry at the same position in
    the matching Ilokano list replaces it, unless that element is the string
    ``'None'``.  In every other case (unknown tag, word not listed, ``'None'``
    entry) the original word is kept.

    The per-sentence TF-IDF sums computed by ``get_sum_tl`` are printed.

    Args:
        sen_poss_list: iterable of per-sentence POS tag lists,
            e.g. ``[['VB', 'DT', 'NN', 'DT', 'NN'], ...]``.
        dict_source: mapping whose ``'Tokenized'`` entry is indexed first by
            sentence position and then by word position to obtain the word.

    Returns:
        A list with one list of output words per sentence.
    """
    # get_sum_tl fills nine debugging buckets with unmatched words (one per
    # handled POS tag plus one for unknown tags); they are not used afterwards.
    debug_buckets = [[] for _ in range(9)]
    sum_tf_idf_tl_list = get_sum_tl(sen_poss_list, dict_source, *debug_buckets, [])

    # Placeholder: this is where the language model goes.

    # POS tag -> (Tagalog word list, Ilokano list aligned with it)
    dictionaries = {
        'SW': (dict_tl.sw_tl_list, dict_tl.sw_il_list),
        'VB': (dict_tl.vb_tl_list, dict_tl.vb_il_list),
        'NN': (dict_tl.nn_tl_list, dict_tl.nn_il_list),
        'JJ': (dict_tl.jj_tl_list, dict_tl.jj_il_list),
        'RB': (dict_tl.rb_tl_list, dict_tl.rb_il_list),
        'CC': (dict_tl.cc_tl_list, dict_tl.cc_il_list),
        'PR': (dict_tl.pr_tl_list, dict_tl.pr_il_list),
        'DT': (dict_tl.dt_tl_list, dict_tl.dt_il_list),
    }

    sen_translation_list = []

    for sp_index, sen_poss in enumerate(sen_poss_list):
        # sen_poss is the list of POS tags of one sentence
        sen_translation = []

        for wp_index, word_pos in enumerate(sen_poss):
            # the word sitting at the same position as the current tag
            word = dict_source['Tokenized'][sp_index][wp_index]
            translated = word  # fallback: keep the source word

            entry = dictionaries.get(word_pos)
            if entry is not None:
                tl_words, il_words = entry
                # A single scan of the word list instead of `in` followed by `.index`.
                try:
                    position = tl_words.index(word)
                except ValueError:
                    position = None
                if position is not None:
                    candidate = il_words[position][0]
                    # the string 'None' marks a missing translation
                    if candidate != 'None':
                        translated = candidate

            sen_translation.append(translated)

        sen_translation_list.append(sen_translation)

    print(sum_tf_idf_tl_list)
    return sen_translation_list

## Opening and processing the Source Document

In [None]:
# Read the Tagalog source story and the Ilokano target story.
# `with` closes each file handle as soon as its content has been read.
with open("src/text data/Story_Tagalog.txt", encoding='utf-8') as source_file:
    test_doc = source_file.read()
with open("src/text data/Story_Ilokano.txt", encoding='utf-8') as target_file:
    target_op = target_file.read()

# One entry per line of the source document.
parsed_source = test_doc.split("\n")
cleaned_source = [remove_punct(line) for line in parsed_source]
toklenized_source = [tokenize(line) for line in cleaned_source]
dict_source = pd.DataFrame({'Tokenized': toklenized_source})

# POS-tag every tokenized line and keep the tags next to the tokens.
pos_sen_list = tag(dict_source['Tokenized'])
dict_source['POS'] = pos_sen_list

# sen_translation_list = translate_smt(dict_source['POS'], dict_source)
translate_smt(dict_source['POS'], dict_source)