# Ilokano Part of Speech Tagger

## Initialization of Data Sets

### Source Data Set

In [3]:
import pandas as pd

# Opening the file
sample_il_raw = open("src/text data/Bible_Ilokano.txt").read()

# Splitting the text into sentences
parsed_sp_il_raw = sample_il_raw.split("\n")

# Creating a dataframe
dict_sm_il = pd.DataFrame(parsed_sp_il_raw, columns = ['Sentence'])

# Printing the first 5 rows of the DataFrame
dict_sm_il.head()

Unnamed: 0,Sentence
0,GENESIS
1,Ti Aramid ti Dios a Namarsua.
2,Idi punganay ti Dios pinarsuana dagiti langlan...
3,Ket ti daga adda idi a gosogoso ken awan nagya...
4,"Ket ti Dios kinunana, Adda coma lawag: ket nag..."


### Determiners Data Set

In [4]:
noun_dtmn_list = ["dagiti", "ti", "kadagiti", "ni", "ken", 'ket', "ni"] # Noun Determiners

adv_dtmn_list = ["idi", "iti"]

prepo_dtmn_list = ["ti", "adda", "addaak"]

adv_time_list = ['madamdama', 'ita', 'kalman', 'inton bigat']

### Affixes Data Set

In [6]:
PREFIX_SET = [
'na', 'ag', 'ka', 'nag', 'im',
'maki', 'panna', 'maka', 'naki', 'naka', 'nang', 'makapag',
'mang', 'agan', 'agay', 'pananga', 'agam', 'nagpa', 'magpa', 
'ipa', 'pag', 'pam', 'taga', 'i', 'napa', 'in', 'manang',
'ma' # a translation for 'ma'
'para', 'pang', 'panag', 'nai', 'manag', 'man', 'kina',
'nai', 'nai', 'nagpa' # nangi
]

Adj_Prefix =[
'ka', 
'na' # a translation for 'ma'
]

INFIX_SET = []
"""
infix sa tagalog ay prefix sa ilokano
sumigaw = inpukaw
"""

SUFFIX_SET = [
'to', 'nto', 'ak' 'en'
# no translation for syon, dor, ita, ing
'na', 'an', 'm'
]

Adj_Suffix = [
'an'
]

PREPO_SET = [
    'tengnga',
    'rabaw', 'rabao', 'baba', 'babaen', 
    'ngatuen', 'ngato', 'sirok', 'sidong',
    'sango', 'sarang', 'saklang', 'sanguanan' 'likud', 
    'ruar', 'uneg',
    'baet', 'sango', 'umuna'
    'ngudo', 'ungto', 'abay', 'igid'
]

CONJ_SET = [
    'ken', 'ket', # no  translation for word 'bali' 
    'gapu', 'ta', 'agsipud',    'laeng', 'ngem', 'nupay kasta',
    'bayat', 'uray', 
    'intono', 'no', 'ta', 'ngamin', 
    'kaso', 'gapuna', 
    'ngem', 'idi',
    'nga', 
    'ni',  'wenno', 
    'para', 'tapno', 'agraman', 
    'numpay kasta', 
    'ken', 'ket', 'kabayatanna', 'bayat', 
    'kada'
]
PER_PRONOUN = [
    'siak', 'sika', 'isu', 'dakami', 'datayo', 'dakayo', 'kayo', 'da',
    'dinak', 'diak', 'kaniak', # no translation for 'siyang' 
    'kadakami', 'kami', # -kami translation is usually connected to another word i.e., 'Maragsakankami'
    'kadakayo', 'dakayo', 'kayo', # -kayo translation is usually connected to another word i.e., 'Umaykayo'
    'ida', 'da', # -da translation is usually connected to another word i.e., 'nagtultuloyda'
    'ko', # -ko translation is usually connected to another word i.e., 'Kayatko'
    # no translation for 'sakin'
    'kukuami', 'kadatayo', 'kukuatayo', 'tayo', # -tayo translation is usually connected to another word i.e., 'Basaentayo'
    # no translation for 'kong' and 'inyong'
    'kata', 'mo', # -mo is usually connected to another word
    'kenkuana', 'mi', # -mi translation is usually connected to another word i.e., 'Insuratmi'
    'yo', 'nyo' # both are usually connected to another word
    'na', # can stand alone and can be connected to another word
]

### Other Sets

In [5]:
vowels = ['a', 'e', 'i', 'o', 'u']

## Cleaning the Data

### Removing the Punctuation/s

In [6]:
import string

def remove_punct(Text):
    text_nopumct = "".join([char for char in Text if char not in string.punctuation])
    return text_nopumct

cleaned_sp_il = [remove_punct(word) for word in parsed_sp_il_raw]


### Tokenizing

In [8]:
import re

def tokenize(text):
    tokens = re.split('\W+', text.lower())
    return tokens


tokenized_sp_tl = [tokenize(word) for word in parsed_sp_il_raw]

dict_sm_il['Tokenized'] = tokenized_sp_tl
dict_sm_il.head()

Unnamed: 0,Sentence,Tokenized
0,GENESIS,[genesis]
1,Ti Aramid ti Dios a Namarsua.,"[ti, aramid, ti, dios, a, namarsua, ]"
2,Idi punganay ti Dios pinarsuana dagiti langlan...,"[idi, punganay, ti, dios, pinarsuana, dagiti, ..."
3,Ket ti daga adda idi a gosogoso ken awan nagya...,"[ket, ti, daga, adda, idi, a, gosogoso, ken, a..."
4,"Ket ti Dios kinunana, Adda coma lawag: ket nag...","[ket, ti, dios, kinunana, adda, coma, lawag, k..."


## Viterbi Algorithm

### Determiner Checker

In [9]:
def isDtmn(word):
    """
    This function checks if the specific word in the sentence is a determiner, and extracts it.
    """
    if word in (noun_dtmn_list + adv_dtmn_list + prepo_dtmn_list + adv_time_list): # if the word is a determiner 
        ans = True
    else:
        ans = False

    return ans

### Verb Affixer Checker

In [11]:
def hasVerbAffixes(word, prev_word, isTagged, hasVerbAffixes):
    """
    This function checks if the specific word in the sentence has a verb affix, and extracts it.
    """
    for prefix in PREFIX_SET:
        if word.startswith(prefix) and not isTagged:
            hasVerbAffixes = True
            isTagged = True
    
    for suffix in SUFFIX_SET:
        """
        words ending with 'ang' are adverbs and after the adverbs are the nouns 
        """
        if word.endswith(suffix) and not isTagged:
            hasVerbAffixes = True
            isTagged = True
    
    return hasVerbAffixes
# end of hasVerbAffixes

### Verb Checker

In [None]:
def isVerb(word, prev_word, next_word, hasVerbAffixes):
    """
    This function tags if the specific word in the sentence is a verb, and extracts it.
    """
    isDone = False
    isVerb = False
    if word not in (PREPO_SET + PER_PRONOUN + CONJ_SET):
        if prev_word not in (noun_dtmn_list + adv_dtmn_list + prepo_dtmn_list): 
            if next_word in (noun_dtmn_list): 
                """
                if the previous word is not in the noun, adverb, and preposition determiner and 
                the next word is a noun determiner
                """
                if hasVerbAffixes:
                    """
                    if the current word has a verb affix/es, then it is a verb
                    """
                    isVerb = True
                    isDone = True
            
            if next_word in PER_PRONOUN:
                """
                if the next word is a personal pronoun
                eg. (insert an example sentence)
                issue: check if there's an issue
                """
                isVerb = True
                isDone = True
                
    if hasVerbAffixes and prev_word == None and not isDone:
        """
        insert comment here
        """
        isVerb = True
        isDone = True
    
    return isVerb
            
    

## Inserting the tagged words in the Dictionary

In [10]:
isTagged = None
hasVerbAffixes = None
dtmn_sen_list = []
sentence_list = dict_sm_il['Tokenized']
"""
instantiations of the variables
"""

for sentence in sentence_list:
    dtmn_list = []
    """
    instantiations of the variables
    """
    for word in sentence:
        
        isTagged = False
        hasVerbAffixes = False
        """
        instantiations of the variables
        """
        try:
            next_word = sentence[sentence.index(word) + 1]
        except (ValueError, IndexError):
            next_word = None
        """
        gets the next word in the sentence
        """
        
        if isDtmn(word):
            """
            checks if the word is a determiner
            """
            dtmn_list.append(word)
            isTagged = True
            
        try:
            hasVerbAffixes = hasVerbAffixes(word, prev_word, isTagged, hasVerbAffixes)
        except (ValueError, IndexError):
            hasVerbAffixes = False
        """
        checks if the word has verb affixes
        """
        
        prev_word = word
        """
        getting the previous word
        """
        
        try:
            prev2_word = sentence[sentence.index(word) - 1]
        except (ValueError, IndexError):
            prev2_word = None
        """
        getting the previous after the previous word
        """
            
    dtmn_sen_list.append(dtmn_list)
    """
    storing the words in the list to the list of sentences
    """

dict_sm_il['Determiner'] = dtmn_sen_list
dict_sm_il.head(30)

Unnamed: 0,Sentence,Tokenized,Determiner
0,GENESIS,[genesis],[]
1,Ti Aramid ti Dios a Namarsua.,"[ti, aramid, ti, dios, a, namarsua, ]","[ti, ti]"
2,Idi punganay ti Dios pinarsuana dagiti langlan...,"[idi, punganay, ti, dios, pinarsuana, dagiti, ...","[idi, ti, dagiti, ken, ti]"
3,Ket ti daga adda idi a gosogoso ken awan nagya...,"[ket, ti, daga, adda, idi, a, gosogoso, ken, a...","[ket, ti, adda, idi, ken, ket, dagiti, idi, it..."
4,"Ket ti Dios kinunana, Adda coma lawag: ket nag...","[ket, ti, dios, kinunana, adda, coma, lawag, k...","[ket, ti, adda, ket, ti]"
5,Ket ti Dios nakitana a ti lawag naimbag: ket i...,"[ket, ti, dios, nakitana, a, ti, lawag, naimba...","[ket, ti, ti, ket, ti, ti]"
6,"Ket ti Dios ninaganna ti lawag aldaw, ket dagi...","[ket, ti, dios, ninaganna, ti, lawag, aldaw, k...","[ket, ti, ti, ket, dagiti, ti, ket, ti, ken, ti]"
7,"Ket ti Dios kinunana, adda coma maysa a tangat...","[ket, ti, dios, kinunana, adda, coma, maysa, a...","[ket, ti, adda, iti, dagiti, dagiti]"
8,"Ket ti Dios inaramidna ti tangatang, ket insin...","[ket, ti, dios, inaramidna, ti, tangatang, ket...","[ket, ti, ti, ket, dagiti, adda, iti, ti, adda..."
9,"Ket ti Dios ninaganna, ti tangatang, Langit. K...","[ket, ti, dios, ninaganna, ti, tangatang, lang...","[ket, ti, ti, ket, ti, ken, ti]"
