## Initialization

In [None]:
import pandas as pd

# Read the whole Tagalog corpus into memory. The context manager closes the
# file handle as soon as the read finishes instead of leaving it open.
with open("src/text data/Bible_Tagalog.txt") as corpus_file:
    sample_tl_raw = corpus_file.read()

# Uncomment to inspect the raw Tagalog text
#sample_tl_raw

In [None]:
# Break the corpus on newline characters; every resulting line is treated as
# one sentence by the rest of the notebook.
parsed_sp_tl_raw = sample_tl_raw.split(sep="\n")

#parsed_sp_tl_raw 

In [None]:
# One row per corpus line, held in a single 'Sentence' column
dict_sm_tl = pd.DataFrame({'Sentence': parsed_sp_tl_raw})

# Preview the first 5 rows of the DataFrame
dict_sm_tl.head()

## Viterbi Algorithm

In [None]:
# Read the determiners file in one go. The context manager closes the file
# handle as soon as the read finishes instead of leaving it open.
with open("src/text data/TL_Determiners.txt") as determiners_file:
    tl_determiners = determiners_file.read()

tl_determiners

In [None]:
# Turn the determiners text into a list, one entry per line of the file.
# NOTE(review): a trailing newline or blank line in the file would leave an
# empty-string entry in this list - confirm against the data file.
parsed_tl_dtmn = tl_determiners.split(sep="\n")

parsed_tl_dtmn

## Cleaning the Data

In [None]:
import string

def remove_punct(pText):
    """
    Return pText with every character listed in string.punctuation removed.

    All other characters (letters, digits, whitespace) are kept in their
    original order.
    """
    kept_chars = (ch for ch in pText if ch not in string.punctuation)
    return "".join(kept_chars)

# Punctuation-free copy of every corpus line, in the same order
cleaned_sp_tl = list(map(remove_punct, parsed_sp_tl_raw))

#cleaned_sp_tl[0:500]

In [None]:
import re

def tokenize(text):
    """
    Lower-case text and return its word tokens in order.

    A token is a maximal run of regex word characters. Extracting the runs
    directly (instead of splitting on the separators) means text that starts
    or ends with a separator, or is empty, produces no empty-string tokens.

    Args:
        text: the sentence to tokenize.

    Returns:
        A list of lower-cased tokens; an empty list when text has no word
        characters.
    """
    # Raw string: '\w' in a normal string literal is an invalid escape sequence.
    return re.findall(r'\w+', text.lower())


# Tokenize the punctuation-free lines rather than the raw ones. On raw text an
# apostrophe or hyphen splits a word in two ("ako'y" -> "ako", "y"), while the
# word lists and affix rules in this notebook expect the joined forms
# ("akoy", "magiikot").
tokenized_sp_tl = [tokenize(line) for line in cleaned_sp_tl]

dict_sm_tl['Tokenized'] = tokenized_sp_tl
dict_sm_tl.head()

In [None]:
def check_dtmn(sentence, dtmn_list):
    """
    Collect the determiners that occur in a tokenized sentence.

    Walks sentence in order and keeps every token that is also found in
    dtmn_list; a determiner that occurs several times is kept each time.
    """
    found = []
    for token in sentence:
        if token in dtmn_list:
            found.append(token)

    return found


# Run check_dtmn on every tokenized sentence; the determiner list is handed
# over as the extra positional argument.
dict_sm_tl['Determiner'] = dict_sm_tl['Tokenized'].apply(check_dtmn, args=(parsed_tl_dtmn,))

dict_sm_tl.head()

## Affixes

In [None]:
""" 
    Affixes
"""
# Prefixes tested with str.startswith by the affix checker. The order of the
# entries is kept exactly as it was.
PREFIX_SET = [
    # nine- to seven-letter prefixes
    'nakikipag', 'pakikipag', 'pinakama', 'pagpapa', 'pinagka', 'panganga',
    'makapag', 'nakapag', 'tagapag', 'makipag', 'nakipag', 'tigapag',
    # six- and five-letter prefixes
    'pakiki', 'magpa', 'napaka', 'pinaka', 'ipinag', 'pagka',
    'pinag', 'mapag',
    # four-letter prefixes
    'mapa', 'taga', 'ipag', 'tiga', 'pala', 'pina',
    'pang', 'naka', 'nang', 'mang', 'sing',
    'ma',  # 'ma' is a prefix in Tagalog for Adjectives, Adverbs, and Verbs
    # three-letter prefixes
    'ipa', 'pam', 'pan', 'pag', 'tag', 'mai',
    'mag', 'nam', 'nag', 'man', 'may',
    # two- and one-letter prefixes
    'na', 'ni', 'pa', 'ka', 'um', 'in', 'i',
    # entries listed after the length-ordered ones
    'nagpa', 'magka', 'nagka', 'ini',
]

# Prefix that marks adjectives
Adj_Prefix = ['ma']

# Infixes looked for anywhere inside a word
INFIX_SET = ['um', 'in']

# Suffixes tested with str.endswith by the affix checker
SUFFIX_SET = [
    'syon',
    'dor',
    'ita',
    'han',
    'hin',
    'ing',
    'ang',
    'ng',
    'an',
    'in',
    'g',
]

# Preposition words ('ibabaw' appears twice, as in the original list)
PREPO_SET = [
    "sumasa",
    'gitna',
    'ibabaw',
    'ilalim',
    'itaas',
    'ibabaw',
]

# Personal pronouns, including their "-ng" linked forms and the "-y" forms
# (e.g. "akoy") that result once punctuation is stripped from "ako'y".
# Every row ends with a comma: without one, Python concatenates adjacent
# string literals and two pronouns silently merge into a single bogus entry.
PER_PRONOUN = [
    'ako', 'ikaw', 'siya', 'kami', 'kayo', 'sila',
    'akong', 'siyang', 'kaming', 'kayong', 'silang',
    'ko', 'akin', 'sakin', 'amin', 'atin', 'inyo',
    'kong', 'inyong',
    'kata', 'mo', 'kanila', 'kanya', 'namin', 'natin',
    'katang', 'mong', 'kanilang', 'kanyang',
    'ninyo', 'niya', 'kayoy', 'ikay', 'akoy', 'siyay', 'kamiy',
    'ninyong', 'niyang',
    'silay', 'inyoy', 'kanilay', 'kanyay', 'niyay',
]

# Noun Determiners
noun_dtmn_list = [
    "ang",
    "ng",
    "mga",
    "si",
    "ay",
    "ni",
]

# Determiner that introduces an adverb
adv_dtmn_list = ["nang"]

# Determiner that introduces a prepositional phrase
prepo_dtmn_list = ["sa"]

# Conjunctions
conj_list = [
    'at',
    'o',
    'saka',
    'ngunit',
    'datapwat',
]

vowels = list("aeiou")

# Adverbs of time
adv_time_list = [
    'mamaya',
    'ngayon',
    'kahapon',
    'bukas',
]

## Verb Affix Checker

In [None]:
def check_verb_affixes(word, prev2_word, prev_word, next_word, prefix_list, infix_list, suffix_list, isDone, hasVerbAffixes):
    """
    Report whether word shows one of the affix patterns treated as verb markers.

    The checks run in this order and the first match wins:

    1. word starts with an entry of prefix_list. Words starting with "mag"
       must additionally reduplicate the start of the stem: either the two
       letters after "mag" are repeated (maglalakad, magbibihis) or a leading
       vowel is doubled (magiikot, magaayos). A bare "mag" has no stem and is
       reported as having no verb affix.
    2. word contains an entry of infix_list anywhere.
    3. word ends with an entry of suffix_list, unless word itself or
       prev_word ends with "ang".
    4. The first two letters of word are immediately repeated (lalakad).

    Args:
        word: the token being examined.
        prev2_word: unused; kept so existing callers keep working.
        prev_word: the token before word ("" or None when there is none).
        next_word: unused; kept so existing callers keep working.
        prefix_list: prefixes for check 1.
        infix_list: infixes for check 2.
        suffix_list: suffixes for check 3.
        isDone: when true, no check runs and hasVerbAffixes is returned as is.
        hasVerbAffixes: value returned when no check matches.

    Returns:
        True when a check matches, otherwise the incoming hasVerbAffixes.
    """
    if isDone:
        return hasVerbAffixes

    if word.startswith(tuple(prefix_list)):
        if not word.startswith("mag"):
            return True

        stem = word[3:]
        if not stem:
            # Indexing past the end of a bare "mag" used to raise IndexError,
            # which every caller turned into "no verb affix".
            return False
        if stem[:2] == stem[2:4]:
            return True
        # Slicing (stem[1:2]) instead of indexing keeps four-letter words such
        # as "maga" from raising IndexError.
        if stem[0] == stem[1:2] and stem[0] in vowels:
            return True

    if any(infix in word for infix in infix_list):
        return True

    # The suffix rule is switched off next to an "-ang" word: per the original
    # rule, words ending in "ang" are adverbs and are followed by nouns.
    prev_word = prev_word or ""
    if (word.endswith(tuple(suffix_list))
            and not word.endswith("ang")
            and not prev_word.endswith("ang")):
        return True

    if len(word) >= 4 and word[:2] == word[2:4]:
        return True

    return hasVerbAffixes
# end of check_verb_affixes()

## Verb Tagger

In [None]:
def tag_verb(sentence, dtmn_list, prepo_list, pronoun_list):
    """
    Extract the words of a tokenized sentence that the rules below tag as verbs.

    Rules 1-4 skip any word found in dtmn_list, prepo_list, pronoun_list or
    the module-level conj_list. The rules are tried in order and a word is
    added to the result at most once:

    1. The previous word is not a noun/adverb/preposition determiner, the next
       word is a noun determiner, and the word has a verb affix.
    2. The previous word is not such a determiner and the next word is a
       pronoun.
    3. The previous word is "ay" and the next word is "ng", "sa" or "nang".
    4. The word starts with "magpa", "nagka", "napa", "naka" or "nag"; or it
       starts with "mag" and the next word is a pronoun, "sa", "ni" or "nang".
    5. The word opens the sentence, has a verb affix, and the next word is a
       pronoun or a determiner other than "ng"/"mga".

    Args:
        sentence: list of tokens.
        dtmn_list: determiners.
        prepo_list: prepositions.
        pronoun_list: pronouns.

    Returns:
        The tagged words, in sentence order.
    """
    verb = []

    # Built once per sentence instead of concatenating the lists for every word.
    excluded = set(dtmn_list) | set(prepo_list) | set(pronoun_list) | set(conj_list)
    blocking_prev = set(noun_dtmn_list) | set(adv_dtmn_list) | set(prepo_dtmn_list)
    last_index = len(sentence) - 1

    # Neighbours are taken by position: sentence.index(word) returns the first
    # occurrence and gives the wrong neighbours for a repeated word.
    for index, word in enumerate(sentence):
        prev_word = sentence[index - 1] if index > 0 else ""
        prev2_word = sentence[index - 2] if index > 1 else ""
        next_word = sentence[index + 1] if index < last_index else None
        is_done = False

        # A failure inside the affix check counts as "no verb affix".
        try:
            has_verb_affixes = check_verb_affixes(word, prev2_word, prev_word, next_word, PREFIX_SET, INFIX_SET, SUFFIX_SET, False, False)
        except (ValueError, IndexError):
            has_verb_affixes = False

        if word not in excluded:
            if prev_word not in blocking_prev:
                # Rule 1
                if next_word in noun_dtmn_list and has_verb_affixes:
                    verb.append(word)
                    is_done = True

                # Rule 2
                if next_word in pronoun_list and not is_done:
                    verb.append(word)
                    is_done = True

            # Rule 3
            if prev_word == "ay" and not is_done and next_word in ("ng", "sa", "nang"):
                verb.append(word)
                is_done = True

            # Rule 4. Real prefix tests: `word[:3] in ("nag")` was a substring
            # test against the string "nag" and also matched "na", "ag", "a"...
            if word and not is_done:
                if word.startswith(("magpa", "nagka", "napa", "naka", "nag")):
                    verb.append(word)
                    is_done = True
                elif word.startswith("mag") and (next_word in pronoun_list or next_word in ("sa", "ni", "nang")):
                    verb.append(word)
                    is_done = True

        # Rule 5. Keyed on the position: prev_word is never None, so the
        # original `prev_word == None` test could not succeed.
        if has_verb_affixes and index == 0 and not is_done:
            if next_word in pronoun_list or (next_word in dtmn_list and next_word not in ('ng', 'mga')):
                verb.append(word)
                is_done = True

    return verb
# end of function

# Tag the verbs of every tokenized sentence; the word lists travel as the
# extra positional arguments of tag_verb.
dict_sm_tl['Verb'] = dict_sm_tl['Tokenized'].apply(tag_verb, args=(parsed_tl_dtmn, PREPO_SET, PER_PRONOUN))
dict_sm_tl.head(30)

## Noun Tagger

In [None]:
def tag_noun(sentence, per_noun_list):
    """
    Extract the words of a tokenized sentence that the rules below tag as nouns.

    The rules are tried in order and a word is added to the result at most once:

    1. The word is in per_noun_list (personal pronouns count as nouns).
    2. The sentence consists of a single word.
    3. The previous word is a noun determiner and the word is not one, unless
       the word looks like a verb ("ay" before it and "ng" after it), looks
       like an adjective (ends in "ng" with more than three letters in front
       of that ending, or starts with "ika", "pinaka" or "pang"), or is
       followed by "ng".
    4. The previous word starts with "pang" and ends with "ng".
    5. The word before the previous one is "ay", the previous word ends with
       "ang", and the word is not a noun determiner.

    Args:
        sentence: list of tokens.
        per_noun_list: personal pronouns.

    Returns:
        The tagged words, in sentence order.
    """
    noun = []
    adj_prefix = ("ika", "pinaka", "pang")
    last_index = len(sentence) - 1

    # Neighbours are taken by position: sentence.index(word) returns the first
    # occurrence and gives the wrong neighbours for a repeated word, and
    # `index - 1` wrapped around to the last word for the start of a sentence.
    for index, word in enumerate(sentence):
        prev_word = sentence[index - 1] if index > 0 else ""
        prev2_word = sentence[index - 2] if index > 1 else ""
        next_word = sentence[index + 1] if index < last_index else None
        is_done = False

        # Rules 1 and 2; `elif` keeps a one-word pronoun sentence from being
        # listed twice.
        if word in per_noun_list:
            noun.append(word)
            is_done = True
        elif len(sentence) == 1:
            noun.append(word)
            is_done = True

        # Rule 3
        if prev_word in noun_dtmn_list and word not in noun_dtmn_list and not is_done:
            is_verb = prev_word == 'ay' and next_word == 'ng'
            # Only the trailing "ng" is discounted; str.replace would also
            # drop every other "ng" inside the word.
            is_adj = word.endswith("ng") and len(word) - 2 > 3

            if not is_verb and not is_adj:
                # All adjective prefixes are checked before deciding; deciding
                # after the first prefix alone let "pinaka..." and "pang..."
                # words through as nouns.
                if word.startswith(adj_prefix):
                    is_done = True
                elif next_word != 'ng':
                    noun.append(word)
                    is_done = True

        # Rule 4
        if prev_word.startswith('pang') and prev_word.endswith('ng') and not is_done:
            noun.append(word)
            is_done = True

        # Rule 5
        if prev2_word == "ay" and prev_word.endswith("ang") and word not in noun_dtmn_list and not is_done:
            noun.append(word)
            is_done = True

    return noun
# end of function

# Tag the nouns of every tokenized sentence; the pronoun list travels as the
# extra positional argument of tag_noun.
dict_sm_tl['Noun'] = dict_sm_tl['Tokenized'].apply(tag_noun, args=(PER_PRONOUN,))
dict_sm_tl.head(30)

## Adjective Tagger

In [None]:
def tag_adj(sentence, dtmn_list, prepo_list, pronoun_list):
    """
    Extract the words of a tokenized sentence that the rules below tag as adjectives.

    Words found in dtmn_list, prepo_list, pronoun_list or the module-level
    conj_list are never tagged. Any other word is tagged once when at least
    one rule holds:

    1. It starts with "ma", has no verb affix, and the next word is "na" or a
       noun determiner other than "ay", "ng", "mga"
       (e.g. "maayos na ang kalsada").
    2. The next word is "ang", the word has no verb affix, and the previous
       word is not a noun determiner.
    3. It ends with "ng" and has no verb affix (e.g. "dalawang bahay").
    4. The previous word is "ay" or "na", the word before that does not start
       with "ika", and the word either has no verb affix or starts with "ma"
       (e.g. "salamin na parihaba").

    Args:
        sentence: list of tokens.
        dtmn_list: determiners.
        prepo_list: prepositions.
        pronoun_list: pronouns.

    Returns:
        The tagged words, in sentence order.
    """
    adj = []

    # Built once per sentence instead of concatenating the lists for every word.
    excluded = set(dtmn_list) | set(prepo_list) | set(pronoun_list) | set(conj_list)
    last_index = len(sentence) - 1

    # Neighbours are taken by position: sentence.index(word) returns the first
    # occurrence and gives the wrong neighbours for a repeated word, and
    # `index - 1` wrapped around to the last word for the start of a sentence.
    for index, word in enumerate(sentence):
        prev_word = sentence[index - 1] if index > 0 else ""
        prev2_word = sentence[index - 2] if index > 1 else ""
        next_word = sentence[index + 1] if index < last_index else None

        # A failure inside the affix check counts as "no verb affix".
        try:
            has_verb_affixes = check_verb_affixes(word, prev2_word, prev_word, next_word, PREFIX_SET, INFIX_SET, SUFFIX_SET, False, False)
        except (ValueError, IndexError):
            has_verb_affixes = False

        if word in excluded:
            continue

        # Rule 1
        # NOTE(review): PREFIX_SET itself contains 'ma', so "ma" words may
        # always come back with a verb affix - confirm this rule can fire.
        ma_before_determiner = (
            word.startswith("ma")
            and (next_word in noun_dtmn_list or next_word == 'na')
            and next_word not in ('ay', 'ng', 'mga')
            and not has_verb_affixes
        )
        # Rule 2
        before_ang = (
            next_word == 'ang'
            and not has_verb_affixes
            and prev_word not in noun_dtmn_list
        )
        # Rule 3
        linked_with_ng = word.endswith("ng") and not has_verb_affixes
        # Rule 4 (original author's note: the plain "and not hasVerbAffixes"
        # condition was removed here for the time being)
        after_linker = (
            prev_word in ('ay', 'na')
            and not prev2_word.startswith('ika')
            and (not has_verb_affixes or word.startswith('ma'))
        )

        if ma_before_determiner or before_ang or linked_with_ng or after_linker:
            adj.append(word)

    return adj
# end of function

# Tag the adjectives of every tokenized sentence; the word lists travel as the
# extra positional arguments of tag_adj.
dict_sm_tl['Adjective'] = dict_sm_tl['Tokenized'].apply(tag_adj, args=(parsed_tl_dtmn, PREPO_SET, PER_PRONOUN))
dict_sm_tl.head(30)

## Adverb Tagger

In [None]:
def tag_adv(sentence):
    """
    Extract the words of a tokenized sentence that the rules below tag as adverbs.

    Personal pronouns are never examined. For any other word the first rule
    that holds is applied:

    1. The word starts with "ma" but not "mag" and the next word is a pronoun
       or "na" (e.g. "mabilis na magsulat").
    2. The previous word is "nang", the word has no verb affix or starts with
       "ma", and the next word is not "ay", "ng" or "mga"
       (e.g. "tumalon nang mataas").
    3. The word is an adverb of time (e.g. "aalis bukas").
    4. The next word is "na" and the word has no verb affix
       (e.g. "tunay na maganda").
    5. The previous word starts with "ma" but not "mag" and the current word
       has a verb affix or starts with "mag": the PREVIOUS word is the adverb
       (e.g. "mabagal magpalit" yields "mabagal"). It is added only when it
       was not already tagged on its own turn.

    Args:
        sentence: list of tokens.

    Returns:
        The tagged words, in the order the rules fired.
    """
    adverb = []
    last_index = len(sentence) - 1
    prev_tagged = False  # whether the previous word is already in the result

    # Neighbours are taken by position: sentence.index(word) returns the first
    # occurrence and gives the wrong neighbours for a repeated word, and
    # `index - 1` wrapped around to the last word for the start of a sentence.
    for index, word in enumerate(sentence):
        prev_word = sentence[index - 1] if index > 0 else ""
        prev2_word = sentence[index - 2] if index > 1 else ""
        next_word = sentence[index + 1] if index < last_index else None

        # A failure inside the affix check counts as "no verb affix".
        try:
            has_verb_affixes = check_verb_affixes(word, prev2_word, prev_word, next_word, PREFIX_SET, INFIX_SET, SUFFIX_SET, False, False)
        except (ValueError, IndexError):
            has_verb_affixes = False

        tagged = False
        if word not in PER_PRONOUN:
            starts_with_ma = word.startswith('ma')
            next_blocks = next_word in ('ay', 'ng', 'mga')

            if (starts_with_ma and not word.startswith('mag')
                    and (next_word in PER_PRONOUN or next_word == 'na')
                    and not next_blocks):
                # Rule 1
                tagged = True
            elif prev_word == 'nang' and (not has_verb_affixes or starts_with_ma) and not next_blocks:
                # Rule 2
                tagged = True
            elif word in adv_time_list:
                # Rule 3
                tagged = True
            elif next_word == 'na' and not has_verb_affixes:
                # Rule 4
                tagged = True
            elif (prev_word.startswith('ma') and not prev_word.startswith('mag')
                    and (has_verb_affixes or word.startswith('mag'))):
                # Rule 5: tags the previous word, not the current one. Skipped
                # when that word is already in the result so it is not listed
                # twice ("mabilis na magsulat").
                if not prev_tagged:
                    adverb.append(prev_word)

            if tagged:
                adverb.append(word)

        prev_tagged = tagged

    return adverb
# end of function

# Tag the adverbs of every tokenized sentence
dict_sm_tl['Adverb'] = dict_sm_tl['Tokenized'].apply(tag_adv)
dict_sm_tl.head(30)

## Tester

In [None]:
# Scratch cell: tokenize one sample sentence, run a single tagger on it and
# print both the tokens and the tagger's result.
# Alternative inputs - a row of the corpus, or a longer hand-written sentence:
# temp_sen = dict_sm_tl['Tokenized'].array[7]
# temp_sen = tokenize("iniligay ng guro ang libro sa mesa")
temp_sen = tokenize("mahina magsalita")
print(temp_sen)

# Exactly one tagger call is left active; swap the comment marker to try another.
# tagged_sen = tag_verb(temp_sen, parsed_tl_dtmn, PREPO_SET, PER_PRONOUN) # verb tagger
# tagged_sen = tag_noun(temp_sen, PER_PRONOUN) # noun tagger
# tagged_sen = tag_adj(temp_sen, parsed_tl_dtmn, PREPO_SET, PER_PRONOUN) # adjective tagger
tagged_sen = tag_adv(temp_sen) # adverb tagger
print(tagged_sen)
