## Initialization

In [None]:
import pandas as pd

# Read the whole corpus into memory. The context manager closes the file
# handle as soon as the read finishes instead of leaving it open.
# NOTE(review): the platform default encoding is used, as before - confirm
# the corpus encoding and pass it explicitly if it is known.
with open("src/text data/Bible_Tagalog.txt") as corpus_file:
    sample_tl_raw = corpus_file.read()

In [None]:
# Cut the corpus at every newline: one list entry per line of the file.
parsed_sp_tl_raw = str.split(sample_tl_raw, "\n")

In [None]:
# One row per corpus line, held in a single 'Sentence' column; later cells
# attach their results as additional columns.
dict_sm_tl = pd.DataFrame({'Sentence': parsed_sp_tl_raw})

# Preview the first five rows.
dict_sm_tl.head()

## Cleaning the Data

### Removing Punctuation

In [None]:
import string

def remove_punct(pText):
    """
    Return pText with every character listed in string.punctuation dropped.
    All other characters are kept in their original order.
    """
    kept_chars = []
    for symbol in pText:
        if symbol in string.punctuation:
            continue
        kept_chars.append(symbol)
    return "".join(kept_chars)

# Punctuation-free copy of every corpus line.
cleaned_sp_tl = list(map(remove_punct, parsed_sp_tl_raw))

#cleaned_sp_tl[0:500]

### Tokenizing

In [None]:
import re

def tokenize(text):
    """
    Lower-case text and split it into word tokens.

    The text is cut at every run of non-word characters (regex \\W+).
    re.split yields an empty string when the text starts or ends with such
    a run (or is empty); those empty strings are discarded so that callers
    only ever receive real words.

    Returns a list of non-empty, lower-cased tokens (possibly empty).
    """
    # Raw string: '\W' inside a normal string literal is an invalid escape.
    return [token for token in re.split(r'\W+', text.lower()) if token]


# Tokenize the punctuation-free lines rather than the raw ones. On raw text
# the tokenizer splits at apostrophes and hyphens ("ako'y" -> "ako", "y";
# "dahan-dahan" -> "dahan", "dahan"), so joined forms such as 'akoy' in
# PER_PRONOUN or "dahandahan" in the adverb rules could never occur.
tokenized_sp_tl = [tokenize(line) for line in cleaned_sp_tl]

dict_sm_tl['Tokenized'] = tokenized_sp_tl
dict_sm_tl.head()

## Affixes Set

In [None]:
""" 
    Affixes
"""
# Prefixes that count as a verb-affix match when a word starts with one.
PREFIX_SET = [
    "nakikipag", "pakikipag", "pinakama",
    "pagpapa", "pinagka", "panganga",
    "makapag", "nakapag", "tagapag",
    "makipag", "nakipag", "tigapag",
    "pakiki", "magpa", "napaka",
    "pinaka", "ipinag", "pagka",
    "pinag", "mapag", "mapa",
    "taga", "ipag", "tiga",
    "pala", "pina", "pang",
    "naka", "nang", "mang",
    "sing",
    "ma",  # 'ma' is a prefix in Tagalog for Adjectives, Adverbs, and Verbs
    "ipa", "pam", "pan",
    "pag", "tag", "mai",
    "mag", "nam", "nag",
    "man", "may", "na",
    "ni", "pa", "ka",
    "um", "in", "i",
    "nagpa", "magka", "nagka",
    "ini",
]

# Adjective prefix.
Adj_Prefix = ["ma"]

# Infixes: matched anywhere inside a word.
INFIX_SET = ["um", "in"]

# Suffixes: matched at the end of a word.
SUFFIX_SET = [
    "syon", "dor", "ita", "han",
    "hin", "ing", "ang", "ng",
    "an", "in", "g",
]

# Location words treated as prepositions.
# "sumasa" is not listed here: it lives in prepo_dtmn_list because it is
# often placed before prepositions.
PREPO_SET = [
    "gitna",
    "ibabaw", "ilalim", "itaas", "ibaba",
    "baba", "taas", "harap", "likod",
    "labas", "loob", "pagitan", "unahan",
    "dulo", "tabi",
]

# Conjunctions.
CONJ_SET = [
    "at", "bali", "dahil", "datapwat",
    "habang", "kahit", "kapag", "kasi",
    "kaso", "kaya", "kaysa", "nang",
    "na", "ngunit", "ni", "o",
    "para", "pati", "pero", "porket",
    "saka", "samantala", "subalit", "tsaka",
    "tuwing", "upang",
]

# Personal pronouns, including their "-ng" linked forms and the "-y" forms
# left behind when punctuation is stripped from contractions (e.g. 'akoy').
# Every line ends with a comma: without it Python silently concatenates
# adjacent string literals, which previously fused 'silang' + 'ko' and
# 'kanyang' + 'ninyo' into two bogus entries and dropped the four real ones.
PER_PRONOUN = [
    'ako', 'ikaw', 'siya', 'kami', 'kayo', 'sila',
    'akong', 'siyang', 'kaming', 'kayong', 'silang',
    'ko', 'akin', 'sakin', 'amin', 'atin', 'inyo',
    'kong', 'inyong',
    'kata', 'mo', 'kanila', 'kanya', 'namin', 'natin',
    'katang', 'mong', 'kanilang', 'kanyang',
    'ninyo', 'niya', 'kayoy', 'ikay', 'akoy', 'siyay', 'kamiy',
    'ninyong', 'niyang',
    'silay', 'inyoy', 'kanilay', 'kanyay', 'niyay',
    'tayo',
]

# The five vowel letters.
vowels = list("aeiou")

# Noun Determiners
noun_dtmn_list = "ang ng mga si ay ni sa".split()

# Adverb determiner.
adv_dtmn_list = "nang".split()

# Words that introduce a preposition.
prepo_dtmn_list = "sa nasa sumasa".split()

# Adverbs of time.
adv_time_list = "mamaya ngayon kahapon bukas".split()

### Determiner Tagger

In [None]:
def isDtmn(word):
    """
    Tell whether word is one of the marker words handled as a determiner.

    The lookup covers the noun, adverb and preposition determiner lists as
    well as the adverbs of time. Returns True or False.
    """
    marker_words = noun_dtmn_list + adv_dtmn_list + prepo_dtmn_list + adv_time_list
    return word in marker_words


# Viterbi Algorithm

## Verb Affix Checker

In [None]:
def check_verb_affixes(word, prev2_word, prev_word, next_word, isTagged, hasVerbAffixes):
    """
    Report whether word shows one of the verb-affix patterns.

    Args:
        word: the token being inspected.
        prev2_word: token two positions back (accepted but not consulted).
        prev_word: token directly before word; a suffix match is ignored
            when this token ends with "ang". None is treated like "".
        next_word: token after word (accepted but not consulted).
        isTagged: True when the caller already tagged word; no pattern is
            checked in that case.
        hasVerbAffixes: value handed back unchanged when nothing matches.

    Returns:
        True as soon as a pattern matches, otherwise hasVerbAffixes.

    Unlike the earlier version, short words starting with "mag" no longer
    raise IndexError, and the bare word "mag" no longer counts as a
    reduplication of two empty slices; it simply falls through to the
    infix, suffix and doubled-syllable rules below.
    """
    if isTagged:
        return hasVerbAffixes

    previous = prev_word or ""

    if word.startswith("mag"):
        # Words starting with "mag" are judged only by the two rules below,
        # never by the generic prefix rule.
        stem = word[3:]

        # The two letters after "mag" are repeated,
        # e.g. maglalakad, maglalaro, magbibihis.
        if len(stem) >= 4 and stem[:2] == stem[2:4]:
            return True

        # The letter after "mag" is a vowel and is doubled,
        # e.g. magiikot, magaayos, maguusap.
        if len(stem) >= 2 and stem[0] in vowels and stem[0] == stem[1]:
            return True
    elif word.startswith(tuple(PREFIX_SET)):
        return True

    if any(infix in word for infix in INFIX_SET):
        return True

    # A suffix only counts when neither this word nor the previous one ends
    # with "ang" (words ending with 'ang' are adverbs and after the adverbs
    # are the nouns).
    if (
        word.endswith(tuple(SUFFIX_SET))
        and not word.endswith("ang")
        and not previous.endswith("ang")
    ):
        return True

    # The first two characters are repeated right after themselves.
    if len(word) >= 4 and word[:2] == word[2:4]:
        return True

    return hasVerbAffixes
# end of check_verb_affixes()

## Verb Checker


In [None]:
def isVerb(word, prev_word, next_word, hasVerbAffixes):
    """
    Decide whether word should be tagged as a verb.

    Args:
        word: the token being inspected.
        prev_word: token before word ("" or None when there is none).
        next_word: token after word (None when there is none).
        hasVerbAffixes: result of check_verb_affixes for word.

    Returns:
        True when one of the rules below matches, otherwise False.

    Fixes over the earlier version:
      * the "mag" rule now tests whether next_word is a personal pronoun;
        before, next_word was compared with the PER_PRONOUN list object
        itself, which never matched;
      * prefix tests use startswith, so one- and two-letter words such as
        "a", "g" or "ag" are no longer accepted as substrings of "nag" or
        "mag";
      * None is compared with "is".
    """
    next_is_pronoun = next_word in PER_PRONOUN

    if word not in (PREPO_SET + PER_PRONOUN + CONJ_SET):
        if prev_word not in (noun_dtmn_list + adv_dtmn_list + prepo_dtmn_list):
            # Previous word is not a determiner, the next word is a noun
            # determiner and the word carries a verb affix.
            # eg. !(sayaw ng bata)
            if hasVerbAffixes and next_word in noun_dtmn_list:
                return True

            # The next word is a personal pronoun. eg. sayaw ka
            # issue: this is not always a verb. eg. bastos ka
            if next_is_pronoun:
                return True

        # Previous word is 'ay' and the next word is 'ng', 'sa' or 'nang'.
        # eg. ay naglalakad na bata
        if prev_word == "ay" and next_word in ("ng", "sa", "nang"):
            return True

        if word:
            # eg. magpapakain, nagkakasakit
            if word.startswith(("magpa", "nagka", "napa", "naka", "nag")):
                return True

            # eg. mag-ayos ka
            if word.startswith("mag") and (next_is_pronoun or next_word in ("sa", "ni", "nang")):
                return True

    # Only reached with a match when the caller passes None as prev_word;
    # an empty string does not trigger this rule.
    # eg. Isinulat niya
    if hasVerbAffixes and prev_word is None:
        if next_is_pronoun or (next_word in noun_dtmn_list and next_word not in ('ng', 'mga')):
            return True

    return False
# end of function

## Noun Tagger

In [None]:
def isNoun(word, prev_word, prev2_word, next_word, sentence_length):
    """
    Decide whether word should be tagged as a noun.

    Args:
        word: the token being inspected.
        prev_word: token before word; None is treated like "".
        prev2_word: token two positions before word.
        next_word: token after word.
        sentence_length: accepted for interface compatibility, not used.

    Returns:
        True when one of the rules below matches, otherwise False.

    Fixes over the earlier version:
      * every adjective prefix is tested before the word is accepted as a
        noun; before, the decision was taken after testing only "ika", so
        words starting with "pinaka" or "pang" were tagged as nouns;
      * a prev_word of None no longer raises AttributeError;
      * the unused adj_suffix local is gone.
    """
    adj_prefix = ("ika", "pinaka", "pang")
    previous = prev_word or ""

    # A personal pronoun is always counted as a noun.
    if word in PER_PRONOUN:
        return True

    # The previous word is a noun determiner and the word itself is not.
    # eg. !(ng mga)
    if previous in noun_dtmn_list and word not in noun_dtmn_list:
        # Ends with 'ng' and is longer than 3 characters once every 'ng' is
        # removed: treated as an adjective, so the checks in this branch are
        # skipped (the later rules still apply).
        ng_adjective = word.endswith("ng") and len(word.replace("ng", "")) > 3

        if not ng_adjective:
            # An adjective prefix settles the question: not a noun.
            if word.startswith(adj_prefix):
                return False
            if next_word != 'ng':
                return True

    # The previous word starts with 'pang' and ends with 'ng'.
    # eg. pangunang araw
    if previous.startswith('pang') and previous.endswith('ng'):
        return True

    # Two words back is "ay", the previous word ends with "ang" and the word
    # is not a noun determiner.
    # eg. ay ang bata
    if prev2_word == "ay" and previous.endswith("ang") and word not in noun_dtmn_list:
        return True

    return False
# end of function

## Adjective Tagger

In [None]:
def isAdj(word, prev_word, prev2_word, next_word, hasVerbAffixes):
    """
    Decide whether word should be tagged as an adjective.

    Args:
        word: the token being inspected.
        prev_word: token before word.
        prev2_word: token two positions before word; None is treated like
            "" (the tagging loop can pass None, which used to raise
            AttributeError here).
        next_word: token after word.
        hasVerbAffixes: result of check_verb_affixes for word.

    Returns:
        True when one of the rules below matches, otherwise False.
    """
    excluded_words = (
        noun_dtmn_list + adv_dtmn_list + prepo_dtmn_list
        + PREPO_SET + PER_PRONOUN + CONJ_SET
    )
    if word in excluded_words:
        return False

    starts_with_ma = word.startswith("ma")

    # Prefix 'ma', no verb affixes, and the next word is 'na' or a noun
    # determiner other than 'ay', 'ng', 'mga'.
    # eg. maayos na ang kalsada
    if (
        starts_with_ma
        and (next_word in noun_dtmn_list or next_word == 'na')
        and next_word not in ('ay', 'ng', 'mga')
        and not hasVerbAffixes
    ):
        return True

    # The next word is 'ang', the word has no verb affixes and the previous
    # word is not a noun determiner.
    if next_word == 'ang' and not hasVerbAffixes and prev_word not in noun_dtmn_list:
        return True

    # Ends with 'ng' and has no verb affixes.
    # eg. dalawang bahay
    if word.endswith("ng") and not hasVerbAffixes:
        return True

    # The previous word is 'ay' or 'na', the word two back does not start
    # with 'ika', and the word either has no verb affixes or starts with 'ma'.
    # eg. salamin na parihaba
    if (
        prev_word in ('ay', 'na')
        and not (prev2_word or "").startswith('ika')
        and (not hasVerbAffixes or starts_with_ma)
    ):
        return True

    return False
# end of function

### Palindrome Checker

In [None]:
def isPalindrome(word):
    """
    Check whether word consists of the same half written twice.

    Despite its name this is not a mirror-image test: the word is cut in the
    middle (integer half of its length) and the two parts are compared as
    they are, so "dahandahan" matches. Each half must be longer than two
    characters; words of odd length never match because the parts differ in
    length.
    """
    midpoint = len(word) // 2
    front, back = word[:midpoint], word[midpoint:]
    return midpoint > 2 and front == back

## Adverb Tagger

In [None]:
def isAdv(word, prev_word, next_word, hasVerbAffixes):
    """
    Decide whether word should be tagged as an adverb.

    Args:
        word: the token being inspected.
        prev_word: token before word; None is treated like "" (it used to
            raise AttributeError).
        next_word: token after word.
        hasVerbAffixes: result of check_verb_affixes for word.

    Returns:
        True when one of the rules below matches, otherwise False.
        Personal pronouns are never adverbs.
    """
    if word in PER_PRONOUN:
        return False

    previous = prev_word or ""
    blocked_next = ('ay', 'ng', 'mga')
    starts_with_ma = word.startswith('ma')

    # Prefix 'ma' (but not 'mag') followed by a personal pronoun or 'na'.
    # eg. mabilis na magsulat
    if (
        starts_with_ma
        and not word.startswith('mag')
        and (next_word in PER_PRONOUN or next_word == 'na')
        and next_word not in blocked_next
    ):
        return True

    # The previous word is 'nang', the word starts with 'ma' or has no verb
    # affixes, and the next word is not 'ay', 'ng' or 'mga'.
    # eg. tumalon nang mataas
    if previous == 'nang' and (not hasVerbAffixes or starts_with_ma) and next_word not in blocked_next:
        return True

    # Adverb of time. eg. aalis bukas
    if word in adv_time_list:
        return True

    # The next word is 'na' and the word has no verb affixes.
    # eg. tunay na maganda
    if next_word == 'na' and not hasVerbAffixes:
        return True

    # The previous word starts with 'ma' (not 'mag') and this word has verb
    # affixes or starts with 'mag'.
    # eg. mabagal magpalit
    if (
        previous.startswith('ma')
        and not previous.startswith('mag')
        and (hasVerbAffixes or word.startswith('mag'))
    ):
        return True

    # The word is the same half written twice.
    # eg. dahandahan (dahan-dahan) siya
    if isPalindrome(word):
        return True

    # Same test once every 'ng' is removed from the word.
    # eg. sobrangsobra (sobrang-sobra) siya
    if 'ng' in word and isPalindrome(word.replace('ng', '')):
        return True

    return False
# end of function

## Preposition Tagger

In [None]:
def isPrepo(word, prev_word):
    """
    Decide whether word should be tagged as a preposition.

    A word is a preposition when it is listed in PREPO_SET and the word
    before it is listed in prepo_dtmn_list. Returns True or False.

    The earlier version overwrote prev_word with an empty string before the
    test, so it could never return True.
    """
    return prev_word in prepo_dtmn_list and word in PREPO_SET
# end of function

## Conjunction Tagger

In [None]:
def isConj(word):
    """
    Tell whether word is listed in CONJ_SET, i.e. is a conjunction.
    Returns True or False.
    """
    return word in CONJ_SET
# end of function

# Word Getter

In [None]:
# Per-sentence results: entry i of each *_sen_list holds the words of
# sentence i that received that tag.
isTagged = None
dtmn_sen_list = []
verb_sen_list = []
noun_sen_list = []
adj_sen_list = []
adv_sen_list = []
prepo_sen_list = []
conj_sen_list = []
sentence_list = dict_sm_tl['Tokenized']

for sentence in sentence_list:
    # Words of the current sentence, grouped by the tag they received.
    dtmn_list = []
    verb_list = []
    noun_list = []
    adj_list = []
    adv_list = []
    prepo_list = []
    conj_list = []
    prev_word = ""
    prev2_word = ""
    sen_len = len(sentence)

    for position, word in enumerate(sentence):
        isTagged = False
        hasVerbAffixes = False

        # Neighbours are looked up by position. Searching with
        # sentence.index(word) returns the first occurrence, which gives the
        # wrong neighbour whenever a word is repeated in the sentence.
        next_word = sentence[position + 1] if position + 1 < sen_len else None

        # Determiners are tagged first and are never examined any further.
        if isDtmn(word):
            dtmn_list.append(word)
            isTagged = True

        # Kept defensive: an affix check that fails on an unusually short
        # word counts as "no verb affixes".
        try:
            hasVerbAffixes = check_verb_affixes(word, prev2_word, prev_word, next_word, isTagged, hasVerbAffixes)
        except (ValueError, IndexError):
            hasVerbAffixes = False

        # A word gets at most one tag; the first matching tagger wins, in
        # this order: verb, noun, adjective, adverb, preposition, conjunction.
        if not isTagged:
            if isVerb(word, prev_word, next_word, hasVerbAffixes):
                verb_list.append(word)
                isTagged = True
            elif isNoun(word, prev_word, prev2_word, next_word, sen_len):
                noun_list.append(word)
                isTagged = True
            elif isAdj(word, prev_word, prev2_word, next_word, hasVerbAffixes):
                adj_list.append(word)
                isTagged = True
            elif isAdv(word, prev_word, next_word, hasVerbAffixes):
                adv_list.append(word)
                isTagged = True
            elif isPrepo(word, prev_word):
                prepo_list.append(word)
                isTagged = True
            elif isConj(word):
                conj_list.append(word)
                isTagged = True

        # Shift the two-word history. At the first word prev2_word stays ""
        # (it used to wrap around to the last word of the sentence).
        prev2_word, prev_word = prev_word, word

    # Store this sentence's words in the per-tag lists of sentences.
    dtmn_sen_list.append(dtmn_list)
    verb_sen_list.append(verb_list)
    noun_sen_list.append(noun_list)
    adj_sen_list.append(adj_list)
    adv_sen_list.append(adv_list)
    prepo_sen_list.append(prepo_list)
    conj_sen_list.append(conj_list)

# dict_sm_tl['Determiner'] = dtmn_sen_list
dict_sm_tl['Verb'] = verb_sen_list
# dict_sm_tl['Noun'] = noun_sen_list
# dict_sm_tl['Adjective'] = adj_sen_list
# dict_sm_tl['Adverb'] = adv_sen_list
# dict_sm_tl['Preposition'] = prepo_sen_list
# dict_sm_tl['Conjunction'] = conj_sen_list
dict_sm_tl.head(30)

    

## Tester

In [None]:
# Spot check: the tokens and the tagged verbs of the row labelled 9.
temp_sen, temp_verb = (dict_sm_tl[column][9] for column in ('Tokenized', 'Verb'))

print(temp_sen, temp_verb, sep="\n")


## Exporting the dictionary to a JSON file

In [None]:
import json

# One dict per DataFrame row, keyed by column name.
dictionary = dict_sm_tl.to_dict(orient='records')

# Serialize the records and write them out in one go.
with open("src/json/tl_pos.json", "w") as outfile:
    outfile.write(json.dumps(dictionary))