# Translator of Documents 

## Tagalog Rule-Based Modeling

### Initialization of Data Sets

#### Opening and processing the Source Document

In [1]:
import pandas as pd

# Opening the files; the context managers guarantee both handles are closed
# as soon as their contents have been read.
with open("../../src/text data/testing data/Tagalog/tl_test_data_bible.txt", encoding='utf-8') as tl_file:
    test_doc = tl_file.read()
with open("../../src/text data/testing data/Ilokano/il_test_data_bible.txt", encoding='utf-8') as il_file:
    target_op = il_file.read()

# Splitting the raw data into sentences (the source file holds one sentence per line)
parsed_test_doc = test_doc.split("\n")

##### Cleaning the Source Document

In [2]:
import string

def remove_punct(pText):
    """
    Return `pText` with every character listed in `string.punctuation` removed.

    All other characters (letters, digits, whitespace) are kept in order.
    """
    strip_table = str.maketrans('', '', string.punctuation)
    return pText.translate(strip_table)

# Strip the punctuation from every sentence of the source document
cleaned_test_doc = list(map(remove_punct, parsed_test_doc))

In [3]:
import re

def tokenize(text):
    """
    Lower-case `text` and split it into a list of word tokens.

    The text is split on runs of non-word characters. Empty strings, which
    the split produces when the text starts or ends with a separator, are
    dropped, so blank or whitespace-only input yields an empty list.

    Parameters:
        text (str): the sentence to tokenize.

    Returns:
        list[str]: the lower-cased tokens in their original order.
    """
    # Filtering in a comprehension avoids removing items from the list that is
    # being iterated, which used to leave a stray '' for whitespace-only text.
    return [token for token in re.split(r'\W+', text.lower()) if token]


# Tokenize every cleaned sentence into a list of lower-case words
tokenized_test_doc = [tokenize(sentence) for sentence in cleaned_test_doc]

In [4]:
# Wrap the tokenized sentences in a one-column DataFrame (one row per sentence);
# tag() later adds a 'POS' column to this same frame.
dict_test_doc = pd.DataFrame({'Tokenized': tokenized_test_doc})  
# Preview of the first rows
dict_test_doc.head()


Unnamed: 0,Tokenized
0,"[at, pinatay, niya, ang, handog, na, susunugin]"
1,"[sa, gayoy, si, husai, na, kaibigan, ni, david..."
2,"[at, sinabi, ng, hari, sa, kaniya, makailang, ..."
3,"[na, iyong, itakuwil, ang, gawa, ng, iyong, mg..."
4,"[itoy, sa, mga, nangaghihimagsik, laban, sa, l..."


#### Setting up the important data sets

In [5]:
# Verb Dictionary
# isVerb() looks words up in the columns 'Salitang-ugat', 'Pangnagdaan',
# 'Pangkasalukuyan', 'Panghinaharap', 'Pawatas' and 'Katatapos' of this frame.
verb_dict = pd.read_json('../../src/json data/Tagalog to Ilokano/verb_dict.json')

# Adjective Dictionary
# NOTE(review): adj_dict is not referenced by any checker visible in this part of the notebook.
adj_dict = pd.read_json('../../src/json data/Tagalog to Ilokano/adj_dict.json')

#### Setting up the important lists and variables

In [6]:
"""
Determiner Lists
"""
noun_dtmn_list = ["ang", "ng", "mga", "si", "ay", "ni", "sa", "y"] # Noun Determiners

# Determiner that introduces adverbs
adv_dtmn_list = ["nang"]

# Determiners that introduce prepositions (see isPrepo)
prepo_dtmn_list = ["sa", "nasa", "sumasa"]

# Adverbs of time; isDtmn() also treats every entry of this list as a determiner.
# NOTE(review): the checkers compare these lists against single tokens produced by
# remove_punct() + tokenize(), so the multi-word and hyphenated entries in the four
# adverb lists below can never match as written.
adv_time_list = ['mamaya', 'ngayon', 'kahapon', 'bukas', 'pagkatapos', 'ngayong gabi', 'sa ngayon', 'kagabi', 'itong umaga', 'susunod na linggo', 'na', 'kamakailan lamang', 'kani-kanina lamang', 'maaga', 'kaagad', 'pa rin', 'pa', 'nakaraan']

# Adverbs of place
adv_place_list = ['dito', 'doon', 'sa dako roon', 'sa lahat ng dako', 'kahit saan', 'wala kahit saan', 'tahanan', 'malayo', 'palabas']

# Adverbs of manner
adv_manner_list = ['tunay', 'lubos', 'medyo', 'mabilis', 'mabuti', 'mahirap', 'dahan-dahan', 'parang hindi', 'bahagya', 'halos lahat', 'halos', 'walang pasubali', 'sama-sama', 'nag-iisa']

# Adverbs of frequency
adv_freq_list = ['lagi', 'madalas', 'karaniwan', 'kung minsan', 'paminsan-minsan', 'bihira', 'madalang', 'hindi kailanman'] 

In [7]:
""" 
    Affixes
"""
# Prefixes that check_verb_affixes() tests with str.startswith().
PREFIX_SET = [
    'nakikipag', 'pakikipag',
    'pinakama', 'pagpapa',
    'pinagka', 'panganga',
    'makapag', 'nakapag',
    'tagapag', 'makipag',
    'nakipag', 'tigapag',
    'pakiki', 'magpa',
    'napaka', 'pinaka',
    'ipinag', 'pagka',
    'pinag', 'mapag',
    'mapa', 'taga',
    'ipag', 'tiga',
    'pala', 'pina',
    'pang', 'naka',
    'nang', 'mang',
    'sing', 'ma', # 'ma' is a prefix in Tagalog for Adjectives, Adverbs, and Verbs
    'ipa', 'pam',
    'pan', 'pag',
    'tag', 'mai',
    'mag', 'nam',
    'nag', 'man',
    'may', 
    'na', 'ni',
    'pa', 'ka',
    'um', 'in',
    'i', 'nagpa', 
    'magka', 'nagka',
    'ini'    
]

# NOTE(review): Adj_Prefix is not referenced by any checker visible in this part of the notebook.
Adj_Prefix = [
    'ma'
]

# Infixes; check_verb_affixes() looks for them anywhere inside the word.
INFIX_SET = [
    'um', 'in',
]

# Suffixes that check_verb_affixes() tests with str.endswith().
SUFFIX_SET = [
    'syon','dor',
    'ita', 'han',
    'hin', 'ing',
    'ang', 'ng',
    'an', 'in',
    'g',
]

# Preposition words; isPrepo() accepts them only after an entry of prepo_dtmn_list.
PREPO_SET = [
    'gitna',            #removed "sumasa", transferred to prepo_dtmn_list since it is often placed before prepositions
    'ibabaw', 'ilalim',
    'itaas', 'ibaba', 
    'baba', 'taas',
    'harap', 'likod', 
    'labas', 'loob',
    'pagitan', 'unahan', 
    'dulo', 'tabi', 'yan'
]

# Conjunctions recognised by isConj().
CONJ_SET = [
    'at', 'bali', 
    'dahil', 'datapwat', 
    'habang', 'kahit', 
    'kapag', 'kasi', 
    'kaso', 'kaya', 
    'kaysa', 'nang',
    'na', 'ngunit', 
    'ni',  'o', 
    'para', 'pati', 
    'pero', 'porket', 
    'saka', 'samantala', 
    'subalit', 'tsaka', 
    'tuwing', 'upang',
    'imbes' 
]

# Enclitic adverbs; isVerb() never tags these words as verbs.
ADV_SET = [
    'rin', 'din', 'ring', 'ding'
]

# Personal pronouns (including their "-ng" and "'y" contracted forms as they
# appear after punctuation removal). isNoun() tags every entry as a noun and
# isVerb()/isAdj() never tag them as verbs/adjectives.
PER_PRONOUN = [
    'ako', 'ikaw', 'siya', 'kami', 'kayo', 'sila',
    # the comma after 'silang' is required: without it Python concatenates the
    # adjacent literals into 'silangko' and both 'silang' and 'ko' go missing
    'akong', 'siyang', 'kaming', 'kayong', 'silang',
    'ko', 'akin', 'sakin', 'amin', 'atin', 'inyo',
    'kong', 'inyong', 'ating', 'saking', 'aming', 'aking',
    'kata', 'mo', 'kanila', 'kanya', 'namin', 'natin',
    'katang', 'mong', 'kanilang', 'kanyang', 'kaniyang',
    'ninyo', 'niya', 'kayoy', 'ikay', 'akoy', 'siyay', 'kamiy',
    'ninyong', 'niyang', 'mare', 'pare', 'kumpare', 'kumare',
    'silay', 'inyoy', 'kanilay', 'kanyay', 'niyay',
    'tayo', 'ka'
]


In [8]:
""" 
    Other Lists
"""
# The five vowels, used by the affix rules to detect repeated vowels
vowels = list('aeiou')

### Setting up the checker functions

In [9]:
"""
    Determiner Checker Function
"""
def isDtmn(word):
    """
    Tell whether `word` is treated as a determiner by the tagger.

    A word counts as a determiner when it appears in the noun, adverb or
    preposition determiner lists, or in the adverbs-of-time list.
    Returns True or False.
    """
    determiner_lists = (noun_dtmn_list, adv_dtmn_list, prepo_dtmn_list, adv_time_list)
    return any(word in candidates for candidates in determiner_lists)

In [10]:
"""
    Verb Affixer Checker Function
"""
def check_verb_affixes(word, prev2_word, prev_word, next_word, isTagged, hasVerbAffixes):
    """
    Report whether `word` shows an affix pattern that the tagger treats as verbal.

    The rules are tried in this order and the first match wins:
      1. the word starts with an entry of PREFIX_SET; words starting with
         "mag"/"nag" additionally need a repeated syllable (maglalakad), a
         repeated vowel (magiikot) or the "magka"/"nagka" prefix (magkaroon);
      2. the word contains an entry of INFIX_SET;
      3. the word ends with an entry of SUFFIX_SET, unless the word itself or
         the previous word ends with "ang";
      4. the first two characters of the word are repeated (e.g. lalakad).

    Parameters:
        word (str): the word to inspect.
        prev2_word, next_word: accepted for interface compatibility; unused.
        prev_word (str): the word before `word` ("" or None when there is none).
        isTagged (bool): when True no rule is applied.
        hasVerbAffixes (bool): value returned when no rule matches.

    Returns:
        bool: True when a rule matches, otherwise the incoming `hasVerbAffixes`.

    Short words such as "mag" or "maga" no longer raise IndexError: a rule
    that needs characters the word does not have simply does not match.
    """
    # Every rule only applies to words that have not been tagged yet.
    if isTagged:
        return hasVerbAffixes

    # Rule 1: prefixes
    if word.startswith(tuple(PREFIX_SET)):
        if word.startswith(("mag", "nag")):
            stem = word[3:]
            # repeated syllable, e.g. maglalakad, naglalaro (known issue: magkakampi);
            # an empty stem must not count as a repetition
            if stem and stem[:2] == stem[2:4]:
                return True
            # repeated vowel, e.g. magiikot, magaayos, naguusap
            if len(stem) >= 2 and stem[0] in vowels and stem[0] == stem[1]:
                return True
            # e.g. magkaroon, magkasama (known issue: magkapatid, magkatyempo)
            if word.startswith(("magka", "nagka")):
                return True
        else:
            return True

    # Rule 2: infixes
    if any(infix in word for infix in INFIX_SET):
        return True

    # Rule 3: suffixes. Words ending in "ang" are treated as adverbs and the
    # word following them as a noun, so neither may count as a suffixed verb.
    previous = prev_word or ""
    if word.endswith(tuple(SUFFIX_SET)) and not word.endswith("ang") and not previous.endswith("ang"):
        return True

    # Rule 4: the first two characters are repeated
    if len(word) >= 4 and word[:2] == word[2:4]:
        return True

    return hasVerbAffixes
# end of check_verb_affixes()
# end of check_verb_affixes()

In [11]:
"""
    Verb Checker Function
"""
def isVerb(word, prev_word, next_word, hasVerbAffixes):
    """
    Decide whether `word` should be tagged as a verb.

    Prepositions, personal pronouns, conjunctions and the enclitic adverbs in
    ADV_SET are never verbs. For every other word the rules below are tried in
    order and the first one that applies decides the result; when none
    applies, the word is looked up in the verb dictionary.

    Parameters:
        word (str): the word to classify.
        prev_word (str): the preceding word ("" or None at sentence start).
        next_word (str): the following word ("" or None at sentence end).
        hasVerbAffixes (bool): result of check_verb_affixes() for `word`.

    Returns:
        bool: True when the word is considered a verb.
    """
    if word in (PREPO_SET + PER_PRONOUN + CONJ_SET + ADV_SET):
        return False

    # tag() passes "" (not None) for a missing neighbour, so test truthiness
    at_sentence_start = not prev_word
    at_sentence_end = not next_word

    if prev_word not in (noun_dtmn_list + adv_dtmn_list + prepo_dtmn_list):
        # not preceded by a determiner and followed by a noun determiner,
        # e.g. NOT "sayaw ng bata" -- the word needs a verb affix
        if next_word in noun_dtmn_list and hasVerbAffixes:
            return True
        # followed by a personal pronoun, e.g. "sayaw ka"
        # known issue: not always a verb, e.g. "bastos ka"
        if next_word in PER_PRONOUN:
            return True

    # "ay <verb>" followed by ng/sa/nang/na or ending the sentence,
    # e.g. "ay naglalakad na bata", "ay naglalakad" (known issue: "ay nanay")
    if prev_word == "ay" and hasVerbAffixes:
        if next_word in ("ng", "sa", "nang", "na") or at_sentence_end:
            return True

    # "na ma___" with five letters ending in a vowel is treated as an
    # adjective (e.g. "na mabait" style words), so it is explicitly not a verb
    if prev_word == 'na' and hasVerbAffixes:
        if word.startswith("ma") and len(word) == 5 and word[4] in vowels:
            return False

    # verbal prefixes, e.g. magpapakain, nagkakasakit, nakatayo, naglakad;
    # startswith() is used so that one- and two-letter words no longer match
    if word.startswith(("magpa", "nagka", "napa", "naka", "nag")):
        return True

    # "mag..." followed by a pronoun or sa/ni/nang, e.g. "mag-ayos ka"
    if word.startswith("mag"):
        if next_word in PER_PRONOUN or next_word in ("sa", "ni", "nang"):
            return True

    # sentence-initial affixed word, e.g. "Isinulat niya"
    if hasVerbAffixes and at_sentence_start:
        if next_word in PER_PRONOUN or (next_word in noun_dtmn_list and next_word not in ('ng', 'mga')):
            return True

    # Fallback for words no rule decided: any form listed in the verb
    # dictionary (root, past, present, future, infinitive, recently completed).
    verb_columns = (
        'Salitang-ugat', 'Pangnagdaan', 'Pangkasalukuyan',
        'Panghinaharap', 'Pawatas', 'Katatapos',
    )
    return any(word == entry for column in verb_columns for entry in verb_dict[column])
# end of function
# end of function

In [12]:
"""
    Noun Checker Function
"""
def isNoun(word, prev_word, prev2_word, next_word, next2_word):
    """
    Decide whether `word` should be tagged as a noun.

    The function first runs a chain of "tagging" rules where the first match
    wins, and then a chain of override rules that are always evaluated, in
    order, and may tag or untag the word regardless of the earlier outcome.
    The statement order is therefore significant.

    Parameters:
        word (str): the word to classify.
        prev_word, prev2_word: the one and two words before `word`.
        next_word, next2_word: the one and two words after `word`.
        A missing neighbour may be passed as "" or None.

    Returns:
        bool: True when the word is considered a noun.
    """
    # Missing context words (None) behave like "no word" instead of crashing
    prev_word = prev_word or ""
    prev2_word = prev2_word or ""
    next2_word = next2_word or ""

    adj_prefix = ["ika", "pinaka", "pang"]
    noun = False
    done = False

    # --- first-match rules -------------------------------------------------
    if prev_word in noun_dtmn_list and word not in noun_dtmn_list:
        # determiner followed by a non-determiner, e.g. NOT "ng mga"
        # a word ending in "ng" whose remainder is longer than 3 letters is
        # taken as a linked adjective, e.g. "ang mabuting tao"
        linked_adjective = word.endswith("ng") and len(word.replace("ng", "")) > 3
        if not linked_adjective:
            # NOTE(review): the noun test runs inside the prefix loop, so only
            # the first prefix ("ika") is checked before a word after "ang", or
            # a word not followed by "ng", is accepted as a noun. Kept as is
            # because changing it would re-tag words such as "panginoon".
            for prefix in adj_prefix:
                if word.startswith(prefix):
                    # adjective prefix, e.g. ika-ayos: not a noun
                    done = True
                    break
                if prev_word == 'ang' or next_word != 'ng':
                    # e.g. "ang espiritu"
                    noun = True
                    done = True
                    break

    if prev_word == "sa" and word not in PREPO_SET and not done:
        # e.g. "sa simbahan" <- tags "simbahan"
        noun = True
        done = True

    if prev2_word == "ay" and prev_word.endswith("ang") and word not in noun_dtmn_list and not done:
        # e.g. "ay ang bata"
        noun = True
        done = True

    if prev_word.endswith("ng") and not done:
        # e.g. "ikalawang araw" <- tags "araw"; "upang magpuno sa gabi" must
        # not tag "magpuno", hence the determiner/conjunction exclusion
        if prev_word not in (noun_dtmn_list + CONJ_SET + adv_dtmn_list):
            noun = True
            done = True
        elif prev_word.startswith("ma") and not word.endswith("ng"):
            noun = True
            done = True

    if prev_word == "na" and not done:
        # membership test: comparing the word with the whole CONJ_SET list
        # using == could never be true
        if prev2_word.startswith(("ma", "ika")) or prev2_word in CONJ_SET:
            noun = True

    # --- override rules (always evaluated, later ones win) -----------------
    if next_word == "na" and next2_word.startswith(("na", "ma")):
        noun = True

    if prev_word == "ng" and next_word == "na":
        # e.g. "nagpagawa siya ng gusali na mataas" <- tags "gusali"
        noun = next2_word.startswith("ma")

    if prev_word.endswith("ng") and word.endswith("ng"):
        # e.g. "dalawang malaking" <- untags "malaking"
        noun = False

    if prev_word in noun_dtmn_list and word.endswith("ng"):
        # e.g. "ang unang araw" <- untags "unang" (it is an adjective)
        noun = False

    if prev_word == "sa" and next_word == "na":
        # adjective between "sa" and "na", e.g. "sa maliit na lamesa"
        noun = False

    if word in PER_PRONOUN:
        # personal pronouns are tagged as nouns, e.g. ako, ikaw, tayo
        noun = True

    return noun
# end of function
# end of function

In [13]:
"""
    Adjective Checker Function
"""
def isAdj(word, prev_word, prev2_word, next_word, hasVerbAffixes):
    """
    Decide whether `word` should be tagged as an adjective.

    Determiners, prepositions, personal pronouns and conjunctions are never
    adjectives. Any other word is an adjective as soon as one of the rules
    below matches.

    Parameters:
        word (str): the word to classify.
        prev_word (str): the preceding word.
        prev2_word (str): the word before `prev_word`; "" or None when missing.
        next_word (str): the following word.
        hasVerbAffixes (bool): result of check_verb_affixes() for `word`.

    Returns:
        bool: True when the word is considered an adjective.
    """
    if word in (noun_dtmn_list + adv_dtmn_list + prepo_dtmn_list + PREPO_SET + PER_PRONOUN + CONJ_SET):
        return False

    # tag() may hand over None for a missing word; treat it as "no word"
    prev2_word = prev2_word or ""

    # Intensive/superlative forms, e.g. napakaganda, pinakamaganda.
    # NOTE(review): these two prefixes deliberately ignore hasVerbAffixes (this
    # mirrors the operator precedence of the original condition); both are
    # listed in PREFIX_SET, so requiring "no verb affix" would disable the rule.
    if word.startswith(("napaka", "pinakama")):
        return True

    # after "ay" or "na", e.g. "salamin na parihaba"; ma- words qualify even
    # when they carry a verb affix
    if prev_word in ('ay', 'na') and not prev2_word.startswith('ika') and (not hasVerbAffixes or word.startswith('ma')):
        return True

    # Every remaining rule requires a word without verb affixes
    if hasVerbAffixes:
        return False

    return bool(
        # ma- word followed by a noun determiner or "na", e.g. "maayos na ang kalsada"
        (word.startswith("ma")
         and (next_word in noun_dtmn_list or next_word == 'na')
         and next_word not in ('ay', 'ng', 'mga'))
        # e.g. pinakagusto
        or word.startswith("pinaka")
        # nag + repeated syllable + han, e.g. naglalakihan, naggagandahan
        or (word.startswith("nag") and word[3:5] == word[5:7] and word.endswith("han"))
        # ma + repeated syllable, e.g. malalaki, magaganda
        or (word.startswith("ma") and word[2:4] == word[4:6])
        # e.g. anlaki, ansarap
        or word.startswith("an")
        # e.g. "ang ganda ng bulaklak"
        or (prev_word == 'ang' and next_word == 'ng')
        # e.g. "naghanda ng malamig na coke"
        or (word.startswith("ma") and prev_word in noun_dtmn_list and next_word == 'na')
        # e.g. "mas maganda"
        or prev_word == 'mas'
        # linked form, e.g. "dalawang bahay"
        or word.endswith("ng")
    )
# end of function
# end of function

In [14]:
"""
    Palindrome Checker Function
"""

def isPalindrome(word): 
    """
    Check whether `word` consists of the same half written twice.

    Despite its name the function does not test for a mirrored spelling: it
    compares the first half of the word with the rest of it
    (e.g. "dahandahan" -> "dahan" + "dahan"). Halves of two characters or
    fewer never count, and a word of odd length can never match.
    """
    midpoint = len(word) // 2
    return midpoint > 2 and word[:midpoint] == word[midpoint:]
# end of function
# end of function

In [15]:
"""
    Adverb Checker Function
"""

def isAdv(word, prev_word, next_word, hasVerbAffixes):
    """
    Decide whether `word` should be tagged as an adverb.

    The word is an adverb as soon as one of the rules below matches.

    Parameters:
        word (str): the word to classify.
        prev_word (str): the preceding word ("" or None when missing).
        next_word (str): the following word.
        hasVerbAffixes (bool): result of check_verb_affixes() for `word`.

    Returns:
        bool: True when the word is considered an adverb.
    """
    prev_word = prev_word or ""

    # ma- (but not mag-) word followed by a pronoun or "na", e.g. "mabilis na magsulat"
    if (word.startswith('ma') and not word.startswith('mag')
            and (next_word in PER_PRONOUN or next_word == 'na')
            and next_word not in ('ay', 'ng', 'mga')):
        return True

    if prev_word == 'nang':
        if next_word not in ('ay', 'ng', 'mga'):
            # no verb affix, e.g. "nang husto"; or a ma- word, e.g. "nang mabilis".
            # The ma- branch used to be guarded by `not isNoun`, which tested the
            # global function object and therefore disabled the rule entirely;
            # tag() only calls isAdv() for words that were not tagged as nouns.
            # NOTE(review): mag- verbs after "nang" also match the ma- test.
            if not hasVerbAffixes or word.startswith('ma'):
                return True
        # e.g. "nang pasimula ay"
        if next_word == 'ay' and word.startswith('pa'):
            return True

    # listed adverbs of time (e.g. "aalis bukas"), frequency, place and manner
    if (word in adv_time_list or word in adv_freq_list
            or word in adv_place_list or word in adv_manner_list):
        return True

    # followed by "na" and without verb affix, e.g. "tunay na maganda"
    if next_word == 'na' and not hasVerbAffixes:
        return True

    # previous word is a ma- (not mag-) word and this word is verb-like,
    # e.g. "mabagal magpalit"
    if (prev_word.startswith('ma') and not prev_word.startswith('mag')
            and (hasVerbAffixes or word.startswith('mag'))):
        return True

    # doubled word, e.g. dahandahan (dahan-dahan)
    if isPalindrome(word):
        return True

    # doubled word joined by the linker "ng", e.g. sobrangsobra (sobrang-sobra)
    if 'ng' in word and isPalindrome(word.replace('ng', '')):
        return True

    return False
# end of function
# end of function

In [16]:
"""
    Preposition Checker Function
"""

def isPrepo(word, prev_word):
    """
    Tell whether `word` is a preposition.

    A word is a preposition only when it is listed in PREPO_SET and is
    directly preceded by a preposition determiner (prepo_dtmn_list).
    Returns True or False.
    """
    return prev_word in prepo_dtmn_list and word in PREPO_SET
# end of function
# end of function

In [17]:
"""
    Conjunction Checker Function
"""

def isConj(word):
    """
    Tell whether `word` is one of the conjunctions listed in CONJ_SET.
    Returns True or False.
    """
    return word in CONJ_SET
# end of function
# end of function

### Setting up the models

In [18]:
""""
    Tagalog to Ilokano Dictionaries
"""

# Single Words Dictionary
dict_sw = pd.read_json('../../src/json data/Tagalog to Ilokano/Example-Based/dict_sw.json')
# One dictionary per part-of-speech tag; the file suffixes mirror the tags that
# tag() emits: VB (verb), NN (noun), JJ (adjective), RB (adverb),
# CC (conjunction), PR (preposition) and DT (determiner).
dict_vb = pd.read_json('../../src/json data/Tagalog to Ilokano/Example-Based/dict_vb.json')
dict_nn = pd.read_json('../../src/json data/Tagalog to Ilokano/Example-Based/dict_nn.json')
dict_jj = pd.read_json('../../src/json data/Tagalog to Ilokano/Example-Based/dict_jj.json')
dict_rb = pd.read_json('../../src/json data/Tagalog to Ilokano/Example-Based/dict_rb.json')
dict_cc = pd.read_json('../../src/json data/Tagalog to Ilokano/Example-Based/dict_cc.json')
dict_pr = pd.read_json('../../src/json data/Tagalog to Ilokano/Example-Based/dict_pr.json')
dict_dt = pd.read_json('../../src/json data/Tagalog to Ilokano/Example-Based/dict_dt.json')

In [19]:
"""
    Putting the columns in a list
"""
# For every dictionary, the Tagalog column and the Ilokano column are turned
# into a pair of plain Python lists (`*_tl_list` / `*_il_list`).
# NOTE(review): presumably the two lists of a pair are index-aligned (entry i of
# the Tagalog list translates to entry i of the Ilokano list) -- confirm against
# the lookup code further down the notebook.
sw_tl_list = dict_sw['Tagalog Single Words'].tolist()
sw_il_list = dict_sw['Ilokano Single Words'].tolist()
vb_tl_list = dict_vb['Tagalog Verb'].tolist()
vb_il_list = dict_vb['Ilokano Verb'].tolist()
nn_tl_list = dict_nn['Tagalog Noun'].tolist()
nn_il_list = dict_nn['Ilokano Noun'].tolist()
jj_tl_list = dict_jj['Tagalog Adjective'].tolist()
jj_il_list = dict_jj['Ilokano Adjective'].tolist()
rb_tl_list = dict_rb['Tagalog Adverb'].tolist()
rb_il_list = dict_rb['Ilokano Adverb'].tolist()
cc_tl_list = dict_cc['Tagalog Conjunction'].tolist()
cc_il_list = dict_cc['Ilokano Conjunction'].tolist()
pr_tl_list = dict_pr['Tagalog Preposition'].tolist()
pr_il_list = dict_pr['Ilokano Preposition'].tolist()
dt_tl_list = dict_dt['Tagalog Determiner'].tolist()
dt_il_list = dict_dt['Ilokano Determiner'].tolist()

### Tagger

In [20]:
def tag(sentence_list):
    """
    Assign a part-of-speech tag to every word of every tokenized sentence.

    For each word the checkers are consulted in a fixed priority order and the
    first one that accepts the word decides its tag:
    SW (the only word of a one-word sentence), DT, CC, VB, NN, RB, JJ, PR and
    finally UNK when no checker accepts it.

    Parameters:
        sentence_list: iterable of sentences, each a list of word tokens.

    Returns:
        None. The list of per-sentence tag lists is stored in the global
        DataFrame as dict_test_doc['POS'].
    """
    pos_sen_list = []

    for sentence in sentence_list:
        pos_list = []
        sen_len = len(sentence)

        # The neighbours are taken by position. Looking the word up with
        # sentence.index(word) returns the FIRST occurrence, which gave wrong
        # context for repeated words (very common: "ng", "sa", "ang") and made
        # the first word's "previous" word wrap around to the last one.
        for position, word in enumerate(sentence):
            prev_word = sentence[position - 1] if position >= 1 else ""
            prev2_word = sentence[position - 2] if position >= 2 else ""
            next_word = sentence[position + 1] if position + 1 < sen_len else ""
            next2_word = sentence[position + 2] if position + 2 < sen_len else ""

            # checks if the word has verb affixes; a word the affix checker
            # cannot handle is treated as having none
            try:
                hasVerbAffixes = check_verb_affixes(word, prev2_word, prev_word, next_word, False, False)
            except (ValueError, IndexError):
                hasVerbAffixes = False

            if sen_len == 1:
                # the sentence is only one word long
                pos = 'SW'
            elif isDtmn(word):
                pos = 'DT'
            elif isConj(word):
                pos = 'CC'
            elif isVerb(word, prev_word, next_word, hasVerbAffixes):
                pos = 'VB'
            elif isNoun(word, prev_word, prev2_word, next_word, next2_word):
                pos = 'NN'
            elif isAdv(word, prev_word, next_word, hasVerbAffixes):
                pos = 'RB'
            elif isAdj(word, prev_word, prev2_word, next_word, hasVerbAffixes):
                pos = 'JJ'
            elif isPrepo(word, prev_word):
                pos = 'PR'
            else:
                # no checker accepted the word: it is an unknown word
                pos = 'UNK'

            pos_list.append(pos)

        # storing the tags of the sentence in the list of sentences
        pos_sen_list.append(pos_list)

    dict_test_doc['POS'] = pos_sen_list

# Tag every tokenized sentence; tag() stores its result in dict_test_doc['POS']
tag(dict_test_doc['Tokenized'])

In [21]:
dict_test_doc.head()

Unnamed: 0,Tokenized,POS
0,"[at, pinatay, niya, ang, handog, na, susunugin]","[CC, VB, NN, DT, NN, DT, UNK]"
1,"[sa, gayoy, si, husai, na, kaibigan, ni, david...","[DT, NN, DT, NN, DT, VB, DT, NN, DT, VB, DT, N..."
2,"[at, sinabi, ng, hari, sa, kaniya, makailang, ...","[CC, VB, DT, NN, DT, NN, UNK, NN, UNK, DT, UNK..."
3,"[na, iyong, itakuwil, ang, gawa, ng, iyong, mg...","[DT, UNK, VB, DT, VB, DT, UNK, DT, NN]"
4,"[itoy, sa, mga, nangaghihimagsik, laban, sa, l...","[VB, DT, DT, NN, VB, DT, NN]"


#### Token Combiner

In [22]:
def combine_tokens(sen_translation_list):
    """
    Join every tokenized sentence back into one space-separated string.

    Parameters
    ----------
    sen_translation_list : iterable of list of str
        One list of word tokens per sentence.

    Returns
    -------
    list of str
        One string per input sentence, in the same order. An empty token
        list yields an empty string.
    """
    # str.join puts the separator strictly between tokens, so a sentence whose
    # final word also occurs earlier (e.g. ['a', 'b', 'a']) cannot end up with
    # a trailing space, which a position lookup through list.index() would
    # cause because it reports the first occurrence of the repeated word.
    return [' '.join(sen_translation) for sen_translation in sen_translation_list]

In [23]:
# Phrase table with parallel 'Tagalog' and 'Ilokano' columns
f_phrases = pd.read_csv('../../src/csv data/f_phrases.csv')

# Each phrase goes through the same pipeline as the source document:
# punctuation is stripped first, then the phrase is split into tokens.
il_phrases = [
    tokenize(remove_punct(phrase))
    for phrase in f_phrases['Ilokano'].to_list()
]

tl_phrases = [
    tokenize(remove_punct(phrase))
    for phrase in f_phrases['Tagalog'].to_list()
]

In [24]:
def inFPhrases(word, word2, word3, word4, word5, word6, word7, tl_phrases):
    """
    Look for a phrase that starts at `word` and continues through the
    following words.

    The phrases are tried in list order and the first one whose tokens all
    equal the leading words wins. Only phrases of one to seven tokens are
    considered; any other length is skipped.

    Returns a tuple (found, phrase, words_used):
        found      - True when a phrase matched, otherwise False
        phrase     - the matching entry of `tl_phrases`, or [] when none matched
        words_used - number of tokens in the matching phrase, or 0
    """
    window = (word, word2, word3, word4, word5, word6, word7)

    for candidate in tl_phrases:
        size = len(candidate)
        if size < 1 or size > len(window):
            continue
        # compare token by token, stopping at the first mismatch
        if all(window[pos] == candidate[pos] for pos in range(size)):
            return True, candidate, size

    return False, [], 0
# end of function

### Translator

In [25]:
def _lookup_translation(word, tl_list, il_list):
    """
    Return the Ilokano entry stored for `word`, or `word` itself.

    `tl_list` and `il_list` are used as index-aligned lists: the entry of
    `il_list` at the position of `word` in `tl_list` is read and its first
    element is returned (presumably the first candidate translation; verify
    against the cell that builds these lists). The word is passed through
    unchanged when it is missing from `tl_list` or when that first element is
    the placeholder string 'None'.
    """
    if word in tl_list:
        il_word = il_list[tl_list.index(word)][0]
        if il_word != 'None':
            return il_word
    return word


def translate(sen_poss_list):
    """
    Translate every sentence of dict_test_doc['Tokenized'] token by token.

    Parameters
    ----------
    sen_poss_list : iterable of list of str
        One list of POS tags per sentence, eg. ['VB', 'DT', 'NN', 'DT', 'NN'].
        Sentence i must line up with dict_test_doc['Tokenized'][i].

    Returns
    -------
    list of list of str
        One list of output tokens per sentence.

    For every token that has not been consumed yet, the token and the six
    tokens after it are offered to inFPhrases(). When a phrase matches, the
    tokens of the Ilokano phrase at the same position in `il_phrases` are
    emitted and all tokens of the Tagalog phrase are skipped. Otherwise the
    token is looked up in the word lists that belong to its POS tag
    (SW, VB, NN, JJ, RB, CC, PR, DT); tokens with any other tag, or without
    a usable entry, are copied to the output unchanged.

    Reads the module-level names dict_test_doc, tl_phrases, il_phrases and
    the per-tag <tag>_tl_list / <tag>_il_list pairs.
    """
    sen_translation_list = []

    for sp_index, sen_poss in enumerate(sen_poss_list):
        sen_translation = []
        cur_wp_index = 0  # position of the next token that still has to be translated

        for wp_index, word_pos in enumerate(sen_poss):
            if wp_index != cur_wp_index:
                # this token was already consumed by a multi-word phrase match
                continue

            tokens = dict_test_doc['Tokenized'][sp_index]
            word = tokens[wp_index]

            # Current token plus up to six following ones; positions past the
            # end of the sentence are filled with None so they never match.
            window = list(tokens[wp_index:wp_index + 7])
            window += [None] * (7 - len(window))

            inFPDict, tl_phrase, w_used = inFPhrases(*window, tl_phrases)

            if inFPDict:
                # the tokens starting here form a known Tagalog phrase
                il_phrase = il_phrases[tl_phrases.index(tl_phrase)]
                sen_translation.extend(il_phrase)
                cur_wp_index = wp_index + w_used
                continue

            cur_wp_index = wp_index + 1

            # Word-level lookup, chosen by the POS tag of the token
            if word_pos == 'SW':
                il_word = _lookup_translation(word, sw_tl_list, sw_il_list)
            elif word_pos == 'VB':
                il_word = _lookup_translation(word, vb_tl_list, vb_il_list)
            elif word_pos == 'NN':
                il_word = _lookup_translation(word, nn_tl_list, nn_il_list)
            elif word_pos == 'JJ':
                il_word = _lookup_translation(word, jj_tl_list, jj_il_list)
            elif word_pos == 'RB':
                il_word = _lookup_translation(word, rb_tl_list, rb_il_list)
            elif word_pos == 'CC':
                il_word = _lookup_translation(word, cc_tl_list, cc_il_list)
            elif word_pos == 'PR':
                il_word = _lookup_translation(word, pr_tl_list, pr_il_list)
            elif word_pos == 'DT':
                il_word = _lookup_translation(word, dt_tl_list, dt_il_list)
            else:
                # any other tag (eg. 'UNK'): keep the source word
                il_word = word

            sen_translation.append(il_word)

        sen_translation_list.append(sen_translation)

    return sen_translation_list

"""
    putting the tokens together in one sentence
"""
# Run the translator over the POS column, then rebuild one string per sentence
sen_translation_list = translate(dict_test_doc['POS'])
temp_sen_list = combine_tokens(sen_translation_list)

# Pair every cleaned source sentence with the translation the system produced
dict_op_ex = pd.DataFrame(
    {
        'Source Text': cleaned_test_doc,
        'System Output': temp_sen_list,
    }
)

In [26]:
dict_op_ex.head()

Unnamed: 0,Source Text,System Output
0,at pinatay niya ang handog na susunugin,ket pinatay niya ang daton na mapuuran
1,sa gayoy si husai na kaibigan ni david ay puma...,cadagiti casta ni cusai na kaibigan ni jesse t...
2,at sinabi ng hari sa kaniya makailang ipasusum...,ket kinunana ng ari cadagiti kencuana makailan...
3,na iyong itakuwil ang gawa ng iyong mga kaaway,nga itakuwil ang gawa ng padam mga rabii
4,itoy sa mga nangaghihimagsik laban sa liwanag,itoy cadagiti sumukir laban cadagiti lawag


In [27]:
# Put the reference (Ilokano) text through the same pipeline as the source
# text: split into lines, strip punctuation, tokenize, re-join with spaces.
# The split lines get their own name so that the Tagalog `parsed_test_doc`
# created in the first cell is not overwritten; re-running the cleaning cell
# afterwards therefore still works on the source document.
parsed_target_op = target_op.split("\n")
cleaned_target_op = [remove_punct(sentence) for sentence in parsed_target_op]
tokenized_target_op = [tokenize(sentence) for sentence in cleaned_target_op]
combine_tokens_target_op = combine_tokens(tokenized_target_op)

# Adds the reference column next to 'Source Text' and 'System Output'
dict_op_ex['Target Output'] = combine_tokens_target_op

In [28]:
dict_op_ex.head()

Unnamed: 0,Source Text,System Output,Target Output
0,at pinatay niya ang handog na susunugin,ket pinatay niya ang daton na mapuuran,ket pinatayna ti daton a mapuuran
1,sa gayoy si husai na kaibigan ni david ay puma...,cadagiti casta ni cusai na kaibigan ni jesse t...,iti casta ni cusa a gayyem ni david immay iti ...
2,at sinabi ng hari sa kaniya makailang ipasusum...,ket kinunana ng ari cadagiti kencuana makailan...,ket ti ari kinunana kencuana maminano a daras ...
3,na iyong itakuwil ang gawa ng iyong mga kaaway,nga itakuwil ang gawa ng padam mga rabii,tapno umsiem ti aramid dagiti imam
4,itoy sa mga nangaghihimagsik laban sa liwanag,itoy cadagiti sumukir laban cadagiti lawag,dagitoy isuda dagiti sumukir iti lawag


In [29]:
import json

# One dict per row of the comparison table
dict_tl_il_result = dict_op_ex.to_dict('records')

# Saving stays best-effort (a failure is reported, not raised), but only the
# errors that opening the file or serialising the records can produce are
# caught, and the cause is printed instead of being discarded.
# NOTE(review): the messages name dict_tl_il_result.json while the file that
# is written is dict_tl_il_test.json - confirm which name is intended.
try:
    with open("../../src/json data/Tagalog to Ilokano/Standard Translator/dict_tl_il_test.json", "w") as outfile:
        json.dump(dict_tl_il_result, outfile)
    print("successfully saved the dict_tl_il_result.json file")
except (OSError, TypeError, ValueError) as err:
    print("Error in saving the dict_tl_il_result.json file:", err)

successfully saved the dict_tl_il_result.json file
