# Example-Based Model

## Initialization of the Datasets


### Source Dataset

#### Storing the Tagalog Part of Speech Data Set

In [1]:
import pandas as pd

# Read the Tagalog POS dataset into a DataFrame.
# The path is relative, so it resolves against the notebook's working directory.
tl_pos_data = pd.read_json('src/json data/tl_pos.json')

# Number of rows in the Tagalog dataset; get_idf() reads this module-level
# value as the total document count.
tl_doc_len = len(tl_pos_data)

# Report the dataset size.
print('Number of documents in the dataset: {}'.format(tl_doc_len))

Number of documents in the dataset: 12439


#### Storing the Ilokano Part of Speech Data Set

In [2]:
# Read the Ilokano POS dataset into a DataFrame.
# The append_*_list helpers and match_tl_il_pos() read its 'Tokenized' column
# through this module-level name.
il_pos_data = pd.read_json('src/json data/il_pos.json')

# Preview the first rows.
il_pos_data.head()

Unnamed: 0,Sentence,Tokenized,Single Word,Determiner,Conjunction,Verb,Noun,Adjective,Adverb,Preposition,Unknown,POS
0,GENESIS,[genesis],[genesis],[],[],[],[],[],[],[],[],[SW]
1,Ti Aramid ti Dios a Namarsua.,"[ti, aramid, ti, dios, a, namarsua]",[],"[ti, ti, a]",[],[aramid],"[dios, namarsua]",[],[],[],[],"[DT, VB, DT, NN, DT, NN]"
2,Idi punganay ti Dios pinarsuana dagiti langlan...,"[idi, punganay, ti, dios, pinarsuana, dagiti, ...",[],"[idi, ti, dagiti, ken, ti]",[],[pinarsuana],"[dios, langlangit, daga]",[],[punganay],[],[],"[DT, RB, DT, NN, VB, DT, NN, DT, DT, NN]"
3,Ket ti daga adda idi a gosogoso ken awan nagya...,"[ket, ti, daga, adda, idi, a, gosogoso, ken, a...",[],"[ti, idi, a, ken, dagiti, idi, iti, ti, ti, ti...","[ket, ket, ket]",[],"[daga, gosogoso, awan, sipnget, yuyeng, espiri...",[],"[adda, nagyan, addada, rabao, nagtignay, rabao...",[],[],"[CC, DT, NN, RB, DT, DT, NN, DT, NN, RB, CC, D..."
4,"Ket ti Dios kinunana, Adda coma lawag: ket nag...","[ket, ti, dios, kinunana, adda, coma, lawag, k...",[],"[ti, coma, ti]","[ket, ket]","[kinunana, adda, nagadda]","[dios, lawag, lawag]",[],[],[],[],"[CC, DT, NN, VB, VB, DT, NN, CC, VB, DT, NN]"


#### Storing the Tagalog Part of Speech Structure

In [3]:
# Start a side-by-side table of per-sentence POS sequences from the Tagalog 'POS' column.
dict_sen_poss = pd.DataFrame(tl_pos_data['POS'])

# Rename the single column so the language is explicit once the Ilokano column is added.
dict_sen_poss.columns = ['Tagalog POS']

#### Storing the Ilokano Part of Speech Structure

In [4]:
# Add the Ilokano POS sequences next to the Tagalog ones.
# NOTE(review): the assignment aligns on the row index, so this presumably relies on
# both datasets listing the same sentences in the same order - confirm.
dict_sen_poss['Ilokano POS'] = il_pos_data['POS']

# Preview the paired POS sequences.
dict_sen_poss.head()

Unnamed: 0,Tagalog POS,Ilokano POS
0,[SW],[SW]
1,"[VB, DT, NN, DT, NN]","[DT, VB, DT, NN, DT, NN]"
2,"[DT, RB, DT, VB, DT, NN, DT, NN, CC, DT, NN]","[DT, RB, DT, NN, VB, DT, NN, DT, DT, NN]"
3,"[CC, DT, NN, DT, JJ, NN, CC, JJ, NN, CC, DT, N...","[CC, DT, NN, RB, DT, DT, NN, DT, NN, RB, CC, D..."
4,"[CC, VB, DT, NN, VB, DT, NN, CC, VB, DT, NN]","[CC, DT, NN, VB, VB, DT, NN, CC, VB, DT, NN]"


### Tagalog to Ilokano Dictionaries (one per Part of Speech)

In [5]:
# Per-sentence POS sequences for each language; match_tl_il_pos() iterates over
# the Tagalog series and looks up the Ilokano series by sentence index.
tl_sen_poss_list = dict_sen_poss['Tagalog POS']
il_sen_poss_list = dict_sen_poss['Ilokano POS']
"""
putting the POS of the sentences in a list object
"""

# Empty dictionary frames, one per part of speech. match_tl_il_pos() fills them
# in place and adds the IDF / count columns.
dict_tl_il_sw = pd.DataFrame(columns=['Tagalog Single Words', 'Ilokano Single Words'])
dict_tl_il_vb = pd.DataFrame(columns=['Tagalog Verb', 'Ilokano Verb'])
dict_tl_il_nn = pd.DataFrame(columns=['Tagalog Noun', 'Ilokano Noun'])
dict_tl_il_jj = pd.DataFrame(columns=['Tagalog Adjective', 'Ilokano Adjective'])
dict_tl_il_rb = pd.DataFrame(columns=['Tagalog Adverb', 'Ilokano Adverb'])
dict_tl_il_cc = pd.DataFrame(columns=['Tagalog Conjunction', 'Ilokano Conjunction'])
dict_tl_il_pr = pd.DataFrame(columns=['Tagalog Preposition', 'Ilokano Preposition'])
dict_tl_il_dt = pd.DataFrame(columns=['Tagalog Determiner', 'Ilokano Determiner'])


## Appending in the List

### For Verb List

In [6]:
def append_vb_list(tl_verb, tl_verb_list, il_verb, il_verb_list, curr_il_pos, next_il_pos, next2_il_pos, matched, sp_index, wp_index, il_verb_count_list, il_verb_count, tl_verb_count_list, tl_verb_sen):
    """
    Record one Tagalog verb and the Ilokano candidate aligned with it.

    All list arguments are mutated in place; nothing is returned.

    tl_verb             -- the Tagalog verb being processed
    tl_verb_list        -- distinct Tagalog verbs seen so far (one row per verb)
    il_verb             -- list that receives the Ilokano candidate for this call
    il_verb_list        -- per-row lists of distinct Ilokano candidates
    curr_il_pos         -- Ilokano POS tag at wp_index
    next_il_pos         -- Ilokano POS tag at wp_index + 1
    next2_il_pos        -- Ilokano POS tag at wp_index + 2
    matched             -- Ilokano word positions already claimed in this sentence
    sp_index, wp_index  -- sentence index and word position used to read
                           il_pos_data['Tokenized']
    il_verb_count_list  -- per-row occurrence counts parallel to il_verb_list
    il_verb_count       -- list that becomes the count row of a new verb
    tl_verb_count_list  -- per-row number of sentences containing the verb
    tl_verb_sen         -- Tagalog verbs already counted for the current sentence
    """
    # --- 1. Register the Tagalog verb and its per-sentence count -------------
    known = tl_verb in tl_verb_list
    if known:
        row = tl_verb_list.index(tl_verb)
    else:
        tl_verb_list.append(tl_verb)

    if tl_verb not in tl_verb_sen:
        # first occurrence inside this sentence
        tl_verb_sen.append(tl_verb)
        if known:
            tl_verb_count_list[row] += 1
        else:
            tl_verb_count_list.append(1)

    # --- 2. Choose the Ilokano candidate from the local POS pattern ----------
    if wp_index in matched:
        # every pattern below requires the current position to be unclaimed
        candidate, claimed = 'None', [wp_index]
    elif curr_il_pos == 'VB':
        # VB : VB
        candidate = il_pos_data['Tokenized'][sp_index][wp_index]
        claimed = [wp_index]
    elif curr_il_pos == 'DT' and next_il_pos == 'VB':
        # VB : DT VB   (eg. Nilalang : ti Aramid)
        il_tokens = il_pos_data['Tokenized'][sp_index]
        candidate = il_tokens[wp_index] + ' ' + il_tokens[wp_index + 1]
        claimed = [wp_index, wp_index + 1]
    elif curr_il_pos == 'NN' and next_il_pos == 'VB':
        # VB : NN VB   (take the verb that follows the noun)
        candidate = il_pos_data['Tokenized'][sp_index][wp_index + 1]
        claimed = [wp_index + 1]
    elif curr_il_pos == 'DT' and next_il_pos == 'NN' and next2_il_pos == 'VB':
        # VB DT NN : DT NN VB   (take the verb two positions ahead)
        candidate = il_pos_data['Tokenized'][sp_index][wp_index + 2]
        claimed = [wp_index + 2]
    else:
        # VB : any other pattern -> no translation found
        candidate, claimed = 'None', [wp_index]

    il_verb.append(candidate)
    matched.extend(claimed)

    # --- 3. Store the candidate and update its occurrence count --------------
    if not known:
        il_verb_list.append(il_verb)
        il_verb_count.append(1)
        il_verb_count_list.append(il_verb_count)
        return

    first_candidate = il_verb[0]
    row_candidates = il_verb_list[row]
    if first_candidate in row_candidates:
        position = row_candidates.index(first_candidate)
        il_verb_count_list[row][position] += 1
    else:
        row_candidates.append(first_candidate)
        il_verb_count_list[row].append(1)

### For Noun List

In [7]:
def append_nn_list(tl_noun, tl_noun_list, il_noun, il_noun_list, curr_il_pos, next_il_pos, matched, sp_index, wp_index, il_noun_count_list, il_noun_count, tl_noun_count_list, tl_noun_sen):
    """
    Record one Tagalog noun and the Ilokano candidate aligned with it.

    All list arguments are mutated in place; nothing is returned.

    tl_noun             -- the Tagalog noun being processed
    tl_noun_list        -- distinct Tagalog nouns seen so far (one row per noun)
    il_noun             -- list that receives the Ilokano candidate for this call
    il_noun_list        -- per-row lists of distinct Ilokano candidates
    curr_il_pos         -- Ilokano POS tag at wp_index
    next_il_pos         -- Ilokano POS tag at wp_index + 1
    matched             -- Ilokano word positions already claimed in this sentence
    sp_index, wp_index  -- sentence index and word position used to read
                           il_pos_data['Tokenized']
    il_noun_count_list  -- per-row occurrence counts parallel to il_noun_list
    il_noun_count       -- list that becomes the count row of a new noun
    tl_noun_count_list  -- per-row number of sentences containing the noun
    tl_noun_sen         -- Tagalog nouns already counted for the current sentence
    """
    # --- 1. Register the Tagalog noun and its per-sentence count -------------
    known = tl_noun in tl_noun_list
    if known:
        row = tl_noun_list.index(tl_noun)
    else:
        tl_noun_list.append(tl_noun)

    if tl_noun not in tl_noun_sen:
        # first occurrence inside this sentence
        tl_noun_sen.append(tl_noun)
        if known:
            tl_noun_count_list[row] += 1
        else:
            tl_noun_count_list.append(1)

    # --- 2. Choose the Ilokano candidate from the local POS pattern ----------
    if curr_il_pos == 'NN':
        # NN : NN
        candidate = il_pos_data['Tokenized'][sp_index][wp_index]
        claimed = wp_index
    elif curr_il_pos == 'DT' and next_il_pos == 'NN':
        # NN : DT NN   (keep the determiner with the noun; only the noun
        # position is recorded as claimed)
        il_tokens = il_pos_data['Tokenized'][sp_index]
        candidate = il_tokens[wp_index] + ' ' + il_tokens[wp_index + 1]
        claimed = wp_index + 1
    else:
        # NN : any other pattern -> no translation found
        candidate = 'None'
        claimed = wp_index

    il_noun.append(candidate)
    matched.append(claimed)

    # --- 3. Store the candidate and update its occurrence count --------------
    if not known:
        il_noun_list.append(il_noun)
        il_noun_count.append(1)
        il_noun_count_list.append(il_noun_count)
        return

    first_candidate = il_noun[0]
    row_candidates = il_noun_list[row]
    if first_candidate in row_candidates:
        position = row_candidates.index(first_candidate)
        il_noun_count_list[row][position] += 1
    else:
        row_candidates.append(first_candidate)
        il_noun_count_list[row].append(1)

### For Adjective List

In [8]:
def append_jj_list(tl_adj, tl_adj_list, il_adj, il_adj_list, curr_il_pos, next_il_pos, next2_il_pos, matched, sp_index, wp_index, il_adj_count_list, il_adj_count, tl_adj_count_list, tl_adj_sen):
    """
    Record one Tagalog adjective and the Ilokano candidate aligned with it.

    All list arguments are mutated in place; nothing is returned.

    tl_adj              -- the Tagalog adjective being processed
    tl_adj_list         -- distinct Tagalog adjectives seen so far (one row each)
    il_adj              -- list that receives the Ilokano candidate for this call
    il_adj_list         -- per-row lists of distinct Ilokano candidates
    curr_il_pos         -- Ilokano POS tag at wp_index
    next_il_pos         -- Ilokano POS tag at wp_index + 1
    next2_il_pos        -- accepted for signature parity; not used by this function
    matched             -- Ilokano word positions already claimed in this sentence
    sp_index, wp_index  -- sentence index and word position used to read
                           il_pos_data['Tokenized']
    il_adj_count_list   -- per-row occurrence counts parallel to il_adj_list
    il_adj_count        -- list that becomes the count row of a new adjective
    tl_adj_count_list   -- per-row number of sentences containing the adjective
    tl_adj_sen          -- Tagalog adjectives already counted for the current sentence
    """
    # --- 1. Register the Tagalog adjective and its per-sentence count --------
    known = tl_adj in tl_adj_list
    if known:
        row = tl_adj_list.index(tl_adj)
    else:
        tl_adj_list.append(tl_adj)

    if tl_adj not in tl_adj_sen:
        # first occurrence inside this sentence
        tl_adj_sen.append(tl_adj)
        if known:
            tl_adj_count_list[row] += 1
        else:
            tl_adj_count_list.append(1)

    # --- 2. Choose the Ilokano candidate from the local POS pattern ----------
    if curr_il_pos == 'JJ':
        # JJ : JJ
        candidate = il_pos_data['Tokenized'][sp_index][wp_index]
        claimed = wp_index
    elif curr_il_pos == 'DT' and next_il_pos == 'JJ':
        # JJ : DT JJ   (eg. mabubuting : ken naimbag)
        il_tokens = il_pos_data['Tokenized'][sp_index]
        candidate = il_tokens[wp_index] + ' ' + il_tokens[wp_index + 1]
        claimed = wp_index + 1
    else:
        # JJ : any other pattern -> no translation found
        candidate = 'None'
        claimed = wp_index

    il_adj.append(candidate)
    matched.append(claimed)

    # --- 3. Store the candidate and update its occurrence count --------------
    if not known:
        il_adj_list.append(il_adj)
        il_adj_count.append(1)
        il_adj_count_list.append(il_adj_count)
        return

    first_candidate = il_adj[0]
    row_candidates = il_adj_list[row]
    if first_candidate in row_candidates:
        position = row_candidates.index(first_candidate)
        il_adj_count_list[row][position] += 1
    else:
        row_candidates.append(first_candidate)
        il_adj_count_list[row].append(1)

### For Adverb List


In [9]:
def append_rb_list(tl_adv, tl_adv_list, il_adv, il_adv_list, curr_il_pos, next_il_pos, next2_il_pos, next3_il_pos, prev_il_pos, matched, sp_index, wp_index, il_adv_count_list, il_adv_count, tl_adv_count_list, tl_adv_sen):
    """
    Record one Tagalog adverb and the Ilokano candidate aligned with it.

    All list arguments are mutated in place; nothing is returned.

    tl_adv              -- the Tagalog adverb being processed
    tl_adv_list         -- distinct Tagalog adverbs seen so far (one row each)
    il_adv              -- list that receives the Ilokano candidate for this call
    il_adv_list         -- per-row lists of distinct Ilokano candidates
    curr_il_pos         -- Ilokano POS tag at wp_index
    next_il_pos         -- Ilokano POS tag at wp_index + 1
    next2_il_pos        -- Ilokano POS tag at wp_index + 2
    next3_il_pos        -- Ilokano POS tag at wp_index + 3
    prev_il_pos         -- Ilokano POS tag at wp_index - 1
    matched             -- Ilokano word positions already claimed in this sentence
    sp_index, wp_index  -- sentence index and word position used to read
                           il_pos_data['Tokenized']
    il_adv_count_list   -- per-row occurrence counts parallel to il_adv_list
    il_adv_count        -- list that becomes the count row of a new adverb
    tl_adv_count_list   -- per-row number of sentences containing the adverb
    tl_adv_sen          -- Tagalog adverbs already counted for the current sentence
    """
    # --- 1. Register the Tagalog adverb and its per-sentence count -----------
    # The membership test must use the word itself (tl_adv). Testing the whole
    # tl_adv_list against tl_adv_sen is always true, which counted every
    # occurrence instead of every sentence and skewed the adverb IDF.
    if tl_adv not in tl_adv_list:
        tl_adv_list.append(tl_adv)

        if tl_adv not in tl_adv_sen:
            tl_adv_sen.append(tl_adv)
            tl_adv_count_list.append(1)

        inDict = False

    else:
        temp_index = tl_adv_list.index(tl_adv)

        if tl_adv not in tl_adv_sen:
            tl_adv_sen.append(tl_adv)
            tl_adv_count_list[temp_index] += 1

        inDict = True

    # --- 2. Choose the Ilokano candidate from the local POS pattern ----------
    if curr_il_pos == 'RB':
        # RB : RB
        temp_adverb = il_pos_data['Tokenized'][sp_index][wp_index]
        il_adv.append(temp_adverb)
        matched.append(wp_index)

    elif curr_il_pos == 'DT' and next_il_pos == 'RB':
        # RB : DT RB   (keep the determiner with the adverb)
        temp_curr_adverb = il_pos_data['Tokenized'][sp_index][wp_index]
        temp_next_adverb = il_pos_data['Tokenized'][sp_index][wp_index + 1]
        temp_adverb = temp_curr_adverb + ' ' + temp_next_adverb
        il_adv.append(temp_adverb)
        matched.append(wp_index + 1)

    elif curr_il_pos == 'DT' and next_il_pos == 'NN' and next2_il_pos == 'DT' and next3_il_pos == 'RB':
        # RB : DT NN DT RB   (take the adverb three positions ahead)
        temp_adverb = il_pos_data['Tokenized'][sp_index][wp_index + 3]
        il_adv.append(temp_adverb)
        matched.append(wp_index + 3)

    elif curr_il_pos == 'DT' and prev_il_pos == 'RB':
        # RB : DT preceded by RB   (take the adverb one position back)
        temp_adverb = il_pos_data['Tokenized'][sp_index][wp_index - 1]
        il_adv.append(temp_adverb)
        matched.append(wp_index - 1)

    else:
        # RB : any other pattern -> no translation found
        il_adv.append('None')
        matched.append(wp_index)

    # --- 3. Store the candidate and update its occurrence count --------------
    if not inDict:
        il_adv_list.append(il_adv)
        il_adv_count.append(1)
        il_adv_count_list.append(il_adv_count)
    else:
        if il_adv[0] not in il_adv_list[temp_index]:
            il_adv_list[temp_index].append(il_adv[0])
            il_adv_count_list[temp_index].append(1)
        else:
            temp_adv_index = il_adv_list[temp_index].index(il_adv[0])
            il_adv_count_list[temp_index][temp_adv_index] += 1

### For Conjunction List

In [10]:
def append_cc_list(tl_conj, tl_conj_list, il_conj, il_conj_list, curr_il_pos, matched, sp_index, wp_index, il_conj_count_list, il_conj_count, tl_conj_count_list, tl_conj_sen):
    """
    Record one Tagalog conjunction and the Ilokano candidate aligned with it.

    All list arguments are mutated in place; nothing is returned.

    tl_conj             -- the Tagalog conjunction being processed
    tl_conj_list        -- distinct Tagalog conjunctions seen so far (one row each)
    il_conj             -- list that receives the Ilokano candidate for this call
    il_conj_list        -- per-row lists of distinct Ilokano candidates
    curr_il_pos         -- Ilokano POS tag at wp_index
    matched             -- Ilokano word positions already claimed in this sentence
    sp_index, wp_index  -- sentence index and word position used to read
                           il_pos_data['Tokenized']
    il_conj_count_list  -- per-row occurrence counts parallel to il_conj_list
    il_conj_count       -- list that becomes the count row of a new conjunction
    tl_conj_count_list  -- per-row number of sentences containing the conjunction
    tl_conj_sen         -- Tagalog conjunctions already counted for the current sentence
    """
    # --- 1. Register the Tagalog conjunction and its per-sentence count ------
    known = tl_conj in tl_conj_list
    if known:
        row = tl_conj_list.index(tl_conj)
    else:
        tl_conj_list.append(tl_conj)

    if tl_conj not in tl_conj_sen:
        # first occurrence inside this sentence
        tl_conj_sen.append(tl_conj)
        if known:
            tl_conj_count_list[row] += 1
        else:
            tl_conj_count_list.append(1)

    # --- 2. Choose the Ilokano candidate -------------------------------------
    # CC : CC takes the token at the same position; anything else is 'None'.
    if curr_il_pos == 'CC':
        candidate = il_pos_data['Tokenized'][sp_index][wp_index]
    else:
        candidate = 'None'

    il_conj.append(candidate)
    matched.append(wp_index)

    # --- 3. Store the candidate and update its occurrence count --------------
    if not known:
        il_conj_list.append(il_conj)
        il_conj_count.append(1)
        il_conj_count_list.append(il_conj_count)
        return

    first_candidate = il_conj[0]
    row_candidates = il_conj_list[row]
    if first_candidate in row_candidates:
        position = row_candidates.index(first_candidate)
        il_conj_count_list[row][position] += 1
    else:
        row_candidates.append(first_candidate)
        il_conj_count_list[row].append(1)

### For Preposition List

In [23]:
def append_pr_list(tl_prepo, tl_prepo_list, il_prepo, il_prepo_list, curr_il_pos, next_il_pos, next2_il_pos, matched, sp_index, wp_index, il_prepo_count_list, il_prepo_count, tl_prepo_count_list, tl_prepo_sen):
    """
    Record one Tagalog preposition and the Ilokano candidate aligned with it.

    All list arguments are mutated in place; nothing is returned.

    tl_prepo             -- the Tagalog preposition being processed
    tl_prepo_list        -- distinct Tagalog prepositions seen so far (one row each)
    il_prepo             -- list that receives the Ilokano candidate for this call
    il_prepo_list        -- per-row lists of distinct Ilokano candidates
    curr_il_pos          -- Ilokano POS tag at wp_index
    next_il_pos          -- accepted for signature parity; not used by this function
    next2_il_pos         -- accepted for signature parity; not used by this function
    matched              -- Ilokano word positions already claimed in this sentence
    sp_index, wp_index   -- sentence index and word position used to read
                            il_pos_data['Tokenized']
    il_prepo_count_list  -- per-row occurrence counts parallel to il_prepo_list
    il_prepo_count       -- list that becomes the count row of a new preposition
    tl_prepo_count_list  -- per-row number of sentences containing the preposition
    tl_prepo_sen         -- Tagalog prepositions already counted for the current sentence
    """
    # --- 1. Register the Tagalog preposition and its per-sentence count ------
    known = tl_prepo in tl_prepo_list
    if known:
        row = tl_prepo_list.index(tl_prepo)
    else:
        tl_prepo_list.append(tl_prepo)

    if tl_prepo not in tl_prepo_sen:
        # first occurrence inside this sentence
        tl_prepo_sen.append(tl_prepo)
        if known:
            tl_prepo_count_list[row] += 1
        else:
            tl_prepo_count_list.append(1)

    # --- 2. Choose the Ilokano candidate -------------------------------------
    # PR : PR takes the token at the same position, but only while that
    # position is still unclaimed; anything else is 'None'.
    position_free = wp_index not in matched
    if position_free and curr_il_pos == 'PR':
        candidate = il_pos_data['Tokenized'][sp_index][wp_index]
    else:
        candidate = 'None'

    il_prepo.append(candidate)
    matched.append(wp_index)

    # --- 3. Store the candidate and update its occurrence count --------------
    if not known:
        il_prepo_list.append(il_prepo)
        il_prepo_count.append(1)
        il_prepo_count_list.append(il_prepo_count)
        return

    first_candidate = il_prepo[0]
    row_candidates = il_prepo_list[row]
    if first_candidate in row_candidates:
        position = row_candidates.index(first_candidate)
        il_prepo_count_list[row][position] += 1
    else:
        row_candidates.append(first_candidate)
        il_prepo_count_list[row].append(1)
    

### For Determiner List

In [25]:
def append_dt_list(tl_dt, tl_dt_list, il_dt, il_dt_list, curr_il_pos, next_il_pos, next2_il_pos, matched, sp_index, wp_index, il_dt_count_list, il_dt_count, tl_dt_count_list, tl_dt_sen):
    """
    Record one Tagalog determiner and the Ilokano candidate aligned with it.

    All list arguments are mutated in place; nothing is returned.

    tl_dt               -- the Tagalog determiner being processed
    tl_dt_list          -- distinct Tagalog determiners seen so far (one row each)
    il_dt               -- list that receives the Ilokano candidate for this call
    il_dt_list          -- per-row lists of distinct Ilokano candidates
    curr_il_pos         -- Ilokano POS tag at wp_index
    next_il_pos         -- accepted for signature parity; not used by this function
    next2_il_pos        -- accepted for signature parity; not used by this function
    matched             -- Ilokano word positions already claimed in this sentence
    sp_index, wp_index  -- sentence index and word position used to read
                           il_pos_data['Tokenized']
    il_dt_count_list    -- per-row occurrence counts parallel to il_dt_list
    il_dt_count         -- list that becomes the count row of a new determiner
    tl_dt_count_list    -- per-row number of sentences containing the determiner
    tl_dt_sen           -- Tagalog determiners already counted for the current sentence
    """
    # --- 1. Register the Tagalog determiner and its per-sentence count -------
    known = tl_dt in tl_dt_list
    if known:
        row = tl_dt_list.index(tl_dt)
    else:
        tl_dt_list.append(tl_dt)

    if tl_dt not in tl_dt_sen:
        # first occurrence inside this sentence
        tl_dt_sen.append(tl_dt)
        if known:
            tl_dt_count_list[row] += 1
        else:
            tl_dt_count_list.append(1)

    # --- 2. Choose the Ilokano candidate -------------------------------------
    # DT : DT takes the token at the same position, but only while that
    # position is still unclaimed; anything else is 'None'.
    position_free = wp_index not in matched
    if position_free and curr_il_pos == 'DT':
        candidate = il_pos_data['Tokenized'][sp_index][wp_index]
    else:
        candidate = 'None'

    il_dt.append(candidate)
    matched.append(wp_index)

    # --- 3. Store the candidate and update its occurrence count --------------
    if not known:
        il_dt_list.append(il_dt)
        il_dt_count.append(1)
        il_dt_count_list.append(il_dt_count)
        return

    first_candidate = il_dt[0]
    row_candidates = il_dt_list[row]
    if first_candidate in row_candidates:
        position = row_candidates.index(first_candidate)
        il_dt_count_list[row][position] += 1
    else:
        row_candidates.append(first_candidate)
        il_dt_count_list[row].append(1)
    

In [13]:
import math

def get_idf(tl_count_list, doc_len=None):
    """
    Compute an inverse-document-frequency score for every count in tl_count_list.

    Each score is abs(log10(doc_len / count)).

    tl_count_list -- iterable of per-word document counts (each must be non-zero)
    doc_len       -- total number of documents; when omitted, the module-level
                     tl_doc_len (size of the Tagalog dataset) is used, which
                     keeps existing one-argument calls working unchanged

    Returns a list of floats in the same order as tl_count_list.
    Raises ZeroDivisionError if a count is 0.
    """
    if doc_len is None:
        # default to the notebook-wide document count
        doc_len = tl_doc_len

    return [abs(math.log10(doc_len / tl_count)) for tl_count in tl_count_list]
# end of get_idf

## Tagalog to Ilokano Matcher

In [26]:
import math

# Module-level placeholder for the word position index.
# NOTE(review): match_tl_il_pos() assigns its own local wp_index and the
# append_*_list helpers receive it as a parameter, so nothing visible in this
# notebook appears to read this global - confirm before removing.
wp_index = None # word position index

"""
instantiating the verb lists
"""

def match_tl_il_pos():
    """
    Align the Tagalog and Ilokano POS sequences and fill the dictionary frames.

    Walks every Tagalog sentence in tl_sen_poss_list together with the Ilokano
    sentence at the same index, and for each Tagalog word dispatches on its POS
    tag to the matching append_*_list helper. When all sentences are processed,
    the IDF scores are computed with get_idf() and the module-level
    dict_tl_il_* DataFrames are filled in place. Nothing is returned.
    """

    def il_pos_at(il_tags, position):
        """Return the Ilokano tag at `position`, or the string 'None' when it is outside the sentence."""
        if position < 0:
            return 'None'
        try:
            return il_tags[position]
        except IndexError:
            return 'None'

    # One accumulator of each kind per POS tag (order matters for get_idf calls).
    pos_tags = ('VB', 'NN', 'JJ', 'RB', 'CC', 'PR', 'DT')
    tl_words = {tag: [] for tag in pos_tags}        # distinct Tagalog words
    il_words = {tag: [] for tag in pos_tags}        # Ilokano candidates per Tagalog word
    il_counts = {tag: [] for tag in pos_tags}       # occurrence counts per candidate
    tl_doc_counts = {tag: [] for tag in pos_tags}   # sentences containing each Tagalog word

    tl_sw_list = []
    il_sw_list = []

    for sp_index, tl_sen_pos in enumerate(tl_sen_poss_list):
        # tl_sen_pos is the POS sequence of one sentence, eg. ['VB', 'DT', 'NN', 'DT', 'NN']
        matched = []
        il_sen = il_sen_poss_list[sp_index]

        # Tagalog words already counted for this sentence, per POS tag
        seen = {tag: [] for tag in pos_tags}

        for wp_index, tl_word_pos in enumerate(tl_sen_pos):
            # tl_word_pos is the POS tag of one word, eg. 'VB'
            tl_word = tl_pos_data['Tokenized'][sp_index][wp_index]

            # Ilokano tags around the current position ('None' past either end)
            curr_il_pos = il_pos_at(il_sen, wp_index)
            next_il_pos = il_pos_at(il_sen, wp_index + 1)
            next2_il_pos = il_pos_at(il_sen, wp_index + 2)
            next3_il_pos = il_pos_at(il_sen, wp_index + 3)
            prev_il_pos = il_pos_at(il_sen, wp_index - 1)

            if tl_word_pos == 'SW':
                # single-word sentence: pair it with the whole Ilokano token list
                il_word = il_pos_data['Tokenized'][sp_index]
                tl_sw_list.append(tl_word)
                il_sw_list.append(il_word)

            elif tl_word_pos == 'VB':
                append_vb_list(tl_word, tl_words['VB'], [], il_words['VB'], curr_il_pos, next_il_pos, next2_il_pos, matched, sp_index, wp_index, il_counts['VB'], [], tl_doc_counts['VB'], seen['VB'])

            elif tl_word_pos == 'NN':
                append_nn_list(tl_word, tl_words['NN'], [], il_words['NN'], curr_il_pos, next_il_pos, matched, sp_index, wp_index, il_counts['NN'], [], tl_doc_counts['NN'], seen['NN'])

            elif tl_word_pos == 'JJ':
                append_jj_list(tl_word, tl_words['JJ'], [], il_words['JJ'], curr_il_pos, next_il_pos, next2_il_pos, matched, sp_index, wp_index, il_counts['JJ'], [], tl_doc_counts['JJ'], seen['JJ'])

            elif tl_word_pos == 'RB':
                append_rb_list(tl_word, tl_words['RB'], [], il_words['RB'], curr_il_pos, next_il_pos, next2_il_pos, next3_il_pos, prev_il_pos, matched, sp_index, wp_index, il_counts['RB'], [], tl_doc_counts['RB'], seen['RB'])

            elif tl_word_pos == 'CC':
                append_cc_list(tl_word, tl_words['CC'], [], il_words['CC'], curr_il_pos, matched, sp_index, wp_index, il_counts['CC'], [], tl_doc_counts['CC'], seen['CC'])

            elif tl_word_pos == 'PR':
                append_pr_list(tl_word, tl_words['PR'], [], il_words['PR'], curr_il_pos, next_il_pos, next2_il_pos, matched, sp_index, wp_index, il_counts['PR'], [], tl_doc_counts['PR'], seen['PR'])

            elif tl_word_pos == 'DT':
                append_dt_list(tl_word, tl_words['DT'], [], il_words['DT'], curr_il_pos, next_il_pos, next2_il_pos, matched, sp_index, wp_index, il_counts['DT'], [], tl_doc_counts['DT'], seen['DT'])

    # All IDF scores are computed before any frame is touched.
    tl_idf = {tag: get_idf(tl_doc_counts[tag]) for tag in pos_tags}

    dict_tl_il_sw['Tagalog Single Words'] = tl_sw_list
    dict_tl_il_sw['Ilokano Single Words'] = il_sw_list

    # (frame, POS tag, column names in assignment order)
    exports = (
        (dict_tl_il_vb, 'VB', ('Tagalog Verb', 'Tagalog Verb IDF', 'Ilokano Verb', 'Ilokano Verb Count')),
        (dict_tl_il_nn, 'NN', ('Tagalog Noun', 'Tagalog Noun IDF', 'Ilokano Noun', 'Ilokano Noun Count')),
        (dict_tl_il_jj, 'JJ', ('Tagalog Adjective', 'Tagalog Adjective IDF', 'Ilokano Adjective', 'Ilokano Adjective Count')),
        (dict_tl_il_rb, 'RB', ('Tagalog Adverb', 'Tagalog Adverb IDF', 'Ilokano Adverb', 'Ilokano Adverb Count')),
        (dict_tl_il_cc, 'CC', ('Tagalog Conjunction', 'Tagalog Conjunction IDF', 'Ilokano Conjunction', 'Ilokano Conjunction Count')),
        (dict_tl_il_pr, 'PR', ('Tagalog Preposition', 'Tagalog Preposition IDF', 'Ilokano Preposition', 'Ilokano Preposition Count')),
        (dict_tl_il_dt, 'DT', ('Tagalog Determiner', 'Tagalog Determiner IDF', 'Ilokano Determiner', 'Ilokano Determiner Count')),
    )
    for frame, tag, (tl_col, idf_col, il_col, count_col) in exports:
        frame[tl_col] = tl_words[tag]
        frame[idf_col] = tl_idf[tag]
        frame[il_col] = il_words[tag]
        frame[count_col] = il_counts[tag]
    
# Build every dictionary: this fills the module-level dict_tl_il_* DataFrames in place.
match_tl_il_pos()
# Uncomment one of the lines below to preview a different dictionary.
# dict_tl_il_sw.head()
# dict_tl_il_vb.head(50)
# dict_tl_il_nn.head(50)
# dict_tl_il_jj.head(50)
# dict_tl_il_rb.head(50)
# dict_tl_il_cc.head(50)
# dict_tl_il_pr.head(50)
# Preview the determiner dictionary.
dict_tl_il_dt.head(50)



Unnamed: 0,Tagalog Determiner,Ilokano Determiner,Ilokano Determiner Count,Tagalog Determiner IDF
0,ng,"[None, iti, ti, coma, dagiti, ken, a, pay, idi...","[12026, 1093, 1792, 34, 667, 376, 1258, 7, 49,...",0.160186
1,ang,"[None, ti, a, dagiti, coma, ken, nga, iti, idi...","[8108, 2547, 754, 1124, 56, 367, 280, 456, 84,...",0.178226
2,nang,"[idi, iti, None, ti, nga, a, ken, ni, dagiti, ...","[344, 113, 600, 20, 71, 39, 5, 24, 5, 6, 2, 1,...",1.025117
3,ay,"[ti, None, a, ken, iti, dagiti, ni, cadagiti, ...","[414, 5404, 467, 96, 416, 107, 192, 63, 83, 13...",0.349164
4,sumasa,"[None, nga, a, idi, ken]","[14, 1, 2, 3, 1]",2.793755
5,na,"[None, nga, a, ti, ken, dagiti, iti, pay, cada...","[8466, 1033, 2211, 638, 245, 182, 584, 13, 125...",0.22193
6,sa,"[cadagiti, None, dagiti, nga, iti, ken, ti, a,...","[610, 15317, 326, 528, 2767, 546, 1333, 1376, ...",0.102409
7,nasa,"[None, nga, dagiti, iti, ti, ditoy, idi, a, it...","[379, 47, 10, 87, 38, 1, 22, 25, 2, 8, 9, 6, 2...",1.328373
8,mga,"[None, dagiti, ti, a, iti, nga, ken, ni, cadag...","[6845, 680, 577, 665, 342, 228, 350, 217, 289,...",0.339444
9,pa,"[None, pay, a, ti, dagiti, ni, idi, iti, ken, ...","[189, 19, 14, 18, 6, 4, 3, 9, 3, 2, 5, 1]",1.681486


## Exporting the Dictionaries to JSON Files

### Saving Single Word Dictionary 

In [15]:
import json

# Convert the single-word dictionary to a list of row dicts and write it out as JSON.
dict_sw = dict_tl_il_sw.to_dict('records')

try:
    with open("src/json data/Example-Based/dict_sw.json", "w") as outfile:
        json.dump(dict_sw, outfile)
    print("successfully saved the dict_sw.json file")
except (OSError, TypeError, ValueError) as save_error:
    # Handle only file-system and serialization failures, and report the cause.
    # A bare `except:` would also swallow KeyboardInterrupt/SystemExit.
    print("Error in saving the dict_sw.json file: {}".format(save_error))

successfully saved the dict_sw.json file


### Saving Verb Dictionary 

In [16]:
# Convert the verb dictionary to a list of row dicts and write it out as JSON.
dict_vb = dict_tl_il_vb.to_dict('records')

try:
    with open("src/json data/Example-Based/dict_vb.json", "w") as outfile:
        json.dump(dict_vb, outfile)
    print("successfully saved the dict_vb.json file")
except (OSError, TypeError, ValueError) as save_error:
    # Handle only file-system and serialization failures, and report the cause.
    # A bare `except:` would also swallow KeyboardInterrupt/SystemExit.
    print("Error in saving the dict_vb.json file: {}".format(save_error))

successfully saved the dict_vb.json file


### Saving Noun Dictionary 

In [17]:
# Convert the noun dictionary to a list of row dicts and write it out as JSON.
dict_nn = dict_tl_il_nn.to_dict('records')

try:
    with open("src/json data/Example-Based/dict_nn.json", "w") as outfile:
        json.dump(dict_nn, outfile)
    print("successfully saved the dict_nn.json file")
except (OSError, TypeError, ValueError) as save_error:
    # Handle only file-system and serialization failures, and report the cause.
    # A bare `except:` would also swallow KeyboardInterrupt/SystemExit.
    print("Error in saving the dict_nn.json file: {}".format(save_error))

successfully saved the dict_nn.json file


### Saving Adjective Dictionary 

In [18]:
# Convert the adjective dictionary to a list of row dicts and write it out as JSON.
dict_jj = dict_tl_il_jj.to_dict('records')

try:
    with open("src/json data/Example-Based/dict_jj.json", "w") as outfile:
        json.dump(dict_jj, outfile)
    print("successfully saved the dict_jj.json file")
except (OSError, TypeError, ValueError) as save_error:
    # Handle only file-system and serialization failures, and report the cause.
    # A bare `except:` would also swallow KeyboardInterrupt/SystemExit.
    print("Error in saving the dict_jj.json file: {}".format(save_error))

successfully saved the dict_jj.json file


### Saving Adverb Dictionary 

In [19]:
# Convert the adverb dictionary to a list of row dicts and write it out as JSON.
dict_rb = dict_tl_il_rb.to_dict('records')

try:
    with open("src/json data/Example-Based/dict_rb.json", "w") as outfile:
        json.dump(dict_rb, outfile)
    print("successfully saved the dict_rb.json file")
except (OSError, TypeError, ValueError) as save_error:
    # Handle only file-system and serialization failures, and report the cause.
    # A bare `except:` would also swallow KeyboardInterrupt/SystemExit.
    print("Error in saving the dict_rb.json file: {}".format(save_error))

successfully saved the dict_rb.json file


### Saving Conjunction Dictionary 

In [20]:
# Convert the conjunction dictionary to a list of row dicts and write it out as JSON.
dict_cc = dict_tl_il_cc.to_dict('records')

try:
    with open("src/json data/Example-Based/dict_cc.json", "w") as outfile:
        json.dump(dict_cc, outfile)
    print("successfully saved the dict_cc.json file")
except (OSError, TypeError, ValueError) as save_error:
    # Handle only file-system and serialization failures, and report the cause.
    # A bare `except:` would also swallow KeyboardInterrupt/SystemExit.
    print("Error in saving the dict_cc.json file: {}".format(save_error))

successfully saved the dict_cc.json file


### Saving Preposition Dictionary 

In [21]:
# Convert the preposition dictionary to a list of row dicts and write it out as JSON.
dict_pr = dict_tl_il_pr.to_dict('records')

try:
    with open("src/json data/Example-Based/dict_pr.json", "w") as outfile:
        json.dump(dict_pr, outfile)
    print("successfully saved the dict_pr.json file")
except (OSError, TypeError, ValueError) as save_error:
    # Handle only file-system and serialization failures, and report the cause.
    # A bare `except:` would also swallow KeyboardInterrupt/SystemExit.
    print("Error in saving the dict_pr.json file: {}".format(save_error))

successfully saved the dict_pr.json file


### Saving Determiner Dictionary 

In [22]:
# Convert the determiner dictionary to a list of row dicts and write it out as JSON.
dict_dt = dict_tl_il_dt.to_dict('records')

try:
    with open("src/json data/Example-Based/dict_dt.json", "w") as outfile:
        json.dump(dict_dt, outfile)
    print("successfully saved the dict_dt.json file")
except (OSError, TypeError, ValueError) as save_error:
    # Handle only file-system and serialization failures, and report the cause.
    # A bare `except:` would also swallow KeyboardInterrupt/SystemExit.
    print("Error in saving the dict_dt.json file: {}".format(save_error))

successfully saved the dict_dt.json file
