# Ilokano Part of Speech Tagger

## Initialization of Data Sets

### Source Data Set

In [215]:
import pandas as pd

# Read the whole corpus into memory. The context manager closes the file
# handle as soon as the text has been read.
with open("src/text data/Bible_Ilokano.txt") as corpus_file:
    sample_il_raw = corpus_file.read()

# One entry per line of the file; each line is treated as one sentence
parsed_sp_il_raw = sample_il_raw.split("\n")

# One row per line, stored in a single 'Sentence' column
dict_sm_il = pd.DataFrame(parsed_sp_il_raw, columns=['Sentence'])

# Preview the first 5 rows of the DataFrame
dict_sm_il.head()

Unnamed: 0,Sentence
0,GENESIS
1,Ti Aramid ti Dios a Namarsua.
2,Idi punganay ti Dios pinarsuana dagiti langlan...
3,Ket ti daga adda idi a gosogoso ken awan nagya...
4,"Ket ti Dios kinunana, Adda coma lawag: ket nag..."


### Determiners Data Set

In [216]:
# Noun determiners ('ket' was deliberately removed from this list).
# The duplicate "ni" entry is harmless for membership tests.
noun_dtmn_list = [
    "dagiti",
    "ti",
    "cadagiti",
    "kadagiti",
    "ni",
    "ken",
    "ni",
    "coma",
    "koma",
    "a",
]

# Adverb determiners
adv_dtmn_list = [
    "idi",
    "iti",
]

# Preposition determiners ('adda' was removed for now).
# "iti" was added, e.g. 'nagtignay iti' (Tagalog gloss: 'ay sumasa').
prepo_dtmn_list = [
    "ti",
    "addaak",
    "iti",
]

# Adverbs of time/place.
# NOTE(review): 'inton bigat' contains a space, so it presumably never
# matches a single-word token - confirm against the tokenizer.
adv_time_list = [
    'madamdama',
    'ita',
    'kalman',
    'inton bigat',
    'ditoy',
    'idiay',
]

### Affixes Data Set

In [217]:
# NOTE: every entry in the lists below must be followed by a comma.
# Adjacent string literals are silently concatenated by Python
# ('ma' 'para' -> 'mapara'), which would hide both affixes from the tagger.

# Verb prefixes. Duplicates are harmless because the list is only scanned
# for "does the word start with any of these".
PREFIX_SET = [
    'na', 'ag', 'ka', 'nag', 'im',
    'maki', 'panna', 'maka', 'naki', 'naka', 'nang', 'makapag',
    'mang', 'agan', 'agay', 'pananga', 'agam', 'nagpa', 'magpa',
    'ipa', 'pag', 'pam', 'taga', 'i', 'napa', 'in', 'manang',
    'ma',  # a translation for 'ma'
    'para', 'pang', 'panag', 'nai', 'manag', 'man', 'kina',
    'nai', 'nai', 'nagpa',  # NOTE(review): original note here only said "nangi" - possibly a prefix still to be added
]

# Adjective prefixes
Adj_Prefix = [
    'ka',
    'na',  # a translation for 'ma'
]

# Verb infixes, e.g. 'in' in 'kinunana' (Tagalog gloss: 'sinabi').
# An infix in Tagalog corresponds to a prefix in Ilokano,
# e.g. Tagalog 'sumigaw' = Ilokano 'inpukaw'.
INFIX_SET = ['in']

# Verb suffixes (no translation for syon, dor, ita, ing)
SUFFIX_SET = [
    'to', 'nto', 'ak', 'en',
    'na', 'an', 'm',
]

# Adjective suffixes
Adj_Suffix = [
    'an',
]

# Prepositions
PREPO_SET = [
    'tengnga',
    'rabaw', 'rabao', 'baba', 'babaen',
    'ngatuen', 'ngato', 'sirok', 'sidong',
    'sango', 'sarang', 'saklang', 'sanguanan', 'likud',
    'ruar', 'uneg',
    'baet', 'sango', 'umuna',
    'ngudo', 'ungto', 'abay', 'igid',
]

# Conjunctions
CONJ_SET = [
    'ken', 'ket',  # no translation for word 'bali'
    'gapu', 'ta', 'agsipud', 'laeng', 'ngem', 'nupay kasta',
    'bayat', 'uray',
    'intono', 'no', 'ta', 'ngamin',
    'kaso', 'gapuna',
    'ngem', 'idi',
    'nga',
    'ni', 'wenno',
    'para', 'tapno', 'agraman',
    'numpay kasta',
    'ken', 'ket', 'kabayatanna', 'bayat',
    'kada',
]

# Personal pronouns
PER_PRONOUN = [
    'siak', 'sika', 'isu', 'dakami', 'datayo', 'dakayo', 'kayo', 'da',
    'dinak', 'diak', 'kaniak',  # no translation for 'siyang'
    'kadakami', 'kami',  # -kami translation is usually connected to another word i.e., 'Maragsakankami'
    'kadakayo', 'dakayo', 'kayo',  # -kayo translation is usually connected to another word i.e., 'Umaykayo'
    'ida', 'da',  # -da translation is usually connected to another word i.e., 'nagtultuloyda'
    'ko',  # -ko translation is usually connected to another word i.e., 'Kayatko'
    # no translation for 'sakin'
    'kukuami', 'kadatayo', 'kukuatayo', 'tayo',  # -tayo translation is usually connected to another word i.e., 'Basaentayo'
    # no translation for 'kong' and 'inyong'
    'kata', 'mo',  # -mo is usually connected to another word
    'kenkuana', 'mi',  # -mi translation is usually connected to another word i.e., 'Insuratmi'
    'yo', 'nyo',  # both are usually connected to another word
    'na',  # can stand alone and can be connected to another word
]

### Other Sets


In [218]:
# The five vowel letters, one single-character string each
vowels = list("aeiou")

## Cleaning the Data

### Removing Punctuation

In [219]:
import string

def remove_punct(Text):
    """
    Return ``Text`` with every character listed in ``string.punctuation``
    dropped; all other characters are kept in their original order.
    """
    kept_chars = filter(lambda symbol: symbol not in string.punctuation, Text)
    return "".join(kept_chars)

# Punctuation-free copy of every raw corpus line
cleaned_sp_il = list(map(remove_punct, parsed_sp_il_raw))

### Tokenizing

In [220]:
import re

def tokenize(text):
    """
    Lower-case ``text`` and split it into word tokens.

    The text is split on runs of non-word characters and empty pieces
    (produced by leading/trailing punctuation or an empty line) are dropped.
    The three consecutive words 'naaramid', 'a', 'casta' are merged into the
    single token 'naaramid a casta' so the phrase can be tagged as one unit.

    Parameters
    ----------
    text : str
        One sentence (one line of the corpus).

    Returns
    -------
    list of str
        The tokens in sentence order; an empty list for an empty line.
    """
    # Raw string: '\W' inside a plain literal is an invalid escape sequence.
    words = [piece for piece in re.split(r'\W+', text.lower()) if piece]

    phrase = ['naaramid', 'a', 'casta']
    tokens = []
    position = 0
    # Build a new list by position instead of editing `words` in place:
    # value-based index()/remove() act on the first occurrence of a word and
    # would delete an earlier 'a' or 'casta' in the same sentence.
    while position < len(words):
        if words[position:position + len(phrase)] == phrase:
            tokens.append(" ".join(phrase))
            position += len(phrase)
        else:
            tokens.append(words[position])
            position += 1
    return tokens



# Tokenize every raw corpus line (tokenize() lower-cases and strips
# punctuation itself, so it is fed the raw text)
tokenized_sp_tl = list(map(tokenize, parsed_sp_il_raw))

# Attach the token lists as a new column and preview the result
dict_sm_il['Tokenized'] = tokenized_sp_tl
dict_sm_il.head()

Unnamed: 0,Sentence,Tokenized
0,GENESIS,[genesis]
1,Ti Aramid ti Dios a Namarsua.,"[ti, aramid, ti, dios, a, namarsua]"
2,Idi punganay ti Dios pinarsuana dagiti langlan...,"[idi, punganay, ti, dios, pinarsuana, dagiti, ..."
3,Ket ti daga adda idi a gosogoso ken awan nagya...,"[ket, ti, daga, adda, idi, a, gosogoso, ken, a..."
4,"Ket ti Dios kinunana, Adda coma lawag: ket nag...","[ket, ti, dios, kinunana, adda, coma, lawag, k..."


## Viterbi Algorithm

### Determiner Checker

In [221]:
def isDtmn(word):
    """
    Return True when ``word`` appears in any of the determiner lists
    (noun, adverb and preposition determiners, plus the adverb-of-time list),
    otherwise False.
    """
    determiners = noun_dtmn_list + adv_dtmn_list + prepo_dtmn_list + adv_time_list
    return word in determiners

### Verb Affix Checker

In [222]:
def check_verb_affixes(word, prev_word, isTagged, hasVerbAffixes):
    """
    Report whether ``word`` carries a verb affix.

    When ``isTagged`` is falsy, the result is True if the word starts with a
    prefix from PREFIX_SET, contains an infix from INFIX_SET anywhere
    (e.g. 'in' in 'kinunana', Tagalog gloss: 'sinabi'), or ends with a suffix
    from SUFFIX_SET. In every other case the incoming ``hasVerbAffixes``
    value is returned unchanged. ``prev_word`` is accepted but not used.
    """
    # An already-tagged word is never re-examined
    if isTagged:
        return hasVerbAffixes

    affix_found = (
        any(word.startswith(prefix) for prefix in PREFIX_SET)
        or any(infix in word for infix in INFIX_SET)
        or any(word.endswith(suffix) for suffix in SUFFIX_SET)
    )
    return True if affix_found else hasVerbAffixes
# end of hasVerbAffixes

### Verb Checker

In [223]:
def isVerb(word, prev_word, prev2_word, next_word, next2_word, hasVerbAffixes):
    """
    Decide whether ``word`` should be tagged as a verb.

    Hard-coded exceptions are checked first, in priority order; the first one
    that applies settles the answer. Otherwise a set of context rules based
    on the neighbouring words and on ``hasVerbAffixes`` is tried.
    ``next2_word`` is accepted but not used. Returns True or False.
    """
    # ---- hard-coded exceptions -------------------------------------------
    if prev_word == 'awan':
        # A word following 'awan' is not a verb,
        # e.g. 'awan nagyan' (Tagalog gloss: 'walang laman')
        return False

    if word == 'espiritu':
        # 'espiritu' (spirit) is never a verb, e.g. 'ti espiritu ti Dios'.
        # Known issue: other words between two 'ti' may also be non-verbs;
        # a noun database might solve this.
        return False

    if word == 'adda':
        # 'adda' is a root word (Tagalog gloss: 'mayroon' / 'magkaroon');
        # a verb database might handle it better.
        if next_word == 'idi':
            # 'adda idi' is treated as an adjective, not a verb
            return False
        if next_word in ('coma', 'koma'):
            # 'adda coma' (Tagalog gloss: 'magkaroon ng') is a verb
            return True
        # any other 'adda' falls through to the context rules below

    if word == 'nagtignay' and next_word == 'iti':
        # 'nagtignay iti' (Tagalog gloss: 'ay sumasa') is not tagged as a verb
        return False

    if word == 'naimbag':
        # 'naimbag' (Tagalog gloss: 'maganda') is an adjective whatever follows
        return False

    if word in ('ninagananna', 'naaramid a casta'):
        # 'ninagananna' (Tagalog gloss: 'tinawag niya' / 'pinangalanan niya')
        # and the merged phrase 'naaramid a casta' (Tagalog gloss:
        # 'nagkagayon') are always verbs
        return True

    # ---- context rules ---------------------------------------------------
    if word not in (PREPO_SET + PER_PRONOUN + CONJ_SET):
        followed_by_noun_dtmn = next_word in noun_dtmn_list

        if prev_word not in (noun_dtmn_list + adv_dtmn_list + prepo_dtmn_list):
            # Not preceded by a determiner: a verb if it has a verb affix and
            # is followed by a noun determiner ...
            if followed_by_noun_dtmn and hasVerbAffixes:
                return True
            # ... or if it is followed by a personal pronoun
            if next_word in PER_PRONOUN:
                return True

        # 'ti <word> <noun determiner other than a>',
        # e.g. 'ti aramid ti dios' (Tagalog gloss: 'Nilalang ng Dios')
        if prev_word == 'ti' and followed_by_noun_dtmn and next_word != 'a':
            return True

        # 'ti <x> <word>' where the word is followed by a noun determiner or
        # has a verb affix, e.g. 'ti Dios pinarsuana dagiti'
        # (Tagalog gloss: 'ay nilikha ng Dios')
        if prev2_word == 'ti' and (followed_by_noun_dtmn or hasVerbAffixes):
            return True

    # Last resort, applied even to prepositions/pronouns/conjunctions:
    # a sentence-initial word that carries a verb affix
    return bool(hasVerbAffixes) and prev_word is None

### Noun Checker

In [224]:
def isNoun(word, prev_word, prev2_word, next_word, next2_word):
    """
    Decide whether ``word`` should be tagged as a noun.

    Personal pronouns are always nouns. Any other non-empty word is only
    considered when it directly follows a noun determiner and is not itself
    a preposition, conjunction or noun determiner; a few prefix and
    reduplication rules then refine the answer. ``prev2_word`` is accepted
    but not used. Returns True or False.
    """
    if word in PER_PRONOUN:
        # A personal pronoun counts as a noun
        return True
    if not word:
        return False
    if prev_word not in noun_dtmn_list or word in (PREPO_SET + CONJ_SET + noun_dtmn_list):
        return False

    # The rules below run in order; a later rule may overrule an earlier one.

    # After a noun determiner the word is a noun unless it starts with
    # 'maica' (without this, 'maicadua' in 'aldaw a maicadua' gets tagged).
    verdict = not word.startswith("maica")

    # '<word> a maica...' -> noun, e.g. 'aldaw a maicadua'
    if next2_word.startswith("maica") and next_word == "a":
        verdict = True

    # First two letters repeated (e.g. 'dadackel', which should be an
    # adjective): a noun only when not followed by ti/nga/a.
    #   a dadackel ti / ti dadackel nga / ti dadackel a
    #   a lalaki ken  / ti lalaki amin
    if word[0:2] == word[2:4]:
        verdict = next_word not in ("ti", "nga", "a")

    # First three letters repeated: not a noun unless a noun determiner follows
    if word[:3] == word[3:6] and next_word not in noun_dtmn_list:
        verdict = False

    return verdict
# end of function

### Adjective Checker

In [225]:
def _has_reduplication(word, start, width):
    """Return True when the ``width`` letters at ``start`` are immediately repeated."""
    segment = word[start:start + width]
    # An empty segment (word too short) must not count: '' == '' would
    # otherwise report e.g. the bare word 'na' as reduplicated.
    return bool(segment) and segment == word[start + width:start + 2 * width]


def isAdj(word, prev_word, prev2_word, next_word, hasVerbAffixes):
    """
    Decide whether ``word`` should be tagged as an adjective.

    Determiners, prepositions, personal pronouns and conjunctions are never
    adjectives. Any other word is an adjective as soon as one of the affix or
    reduplication rules below matches. ``prev2_word`` is accepted but not
    used. Returns True or False.
    """
    excluded = (noun_dtmn_list + adv_dtmn_list + prepo_dtmn_list
                + PREPO_SET + PER_PRONOUN + CONJ_SET)
    if word in excluded:
        return False

    # Prefix 'na' followed by a noun determiner,
    # e.g. 'napintas ti balay' (Tagalog gloss: 'maganda ang bahay'),
    #      'naimbag a bigat' (Tagalog gloss: 'magandang umaga')
    if (word.startswith("na")
            and (next_word in noun_dtmn_list or next_word == 'a')
            and not hasVerbAffixes):
        return True

    # Superlative: prefix 'ka' + suffix 'an',
    # e.g. 'kadakkelan' (Tagalog gloss: 'pinakamalaki')
    if word.startswith("ka") and word.endswith("an"):
        return True

    # Contains 'una' and is followed by 'a'/'nga',
    # e.g. 'umuna a bilin', 'immuna nga arida'
    if "una" in word and next_word in ('a', 'nga') and not hasVerbAffixes:
        return True

    # Ordinal: prefix 'maika'/'maica', e.g. 'maicadua'
    if word.startswith(("maika", "maica")):
        return True

    # Reduplicated first 3 letters (comparative, e.g. 'dakdakkel',
    # 'basbassit') or first 2 letters (e.g. 'dadakkel', 'babassit'),
    # next to a noun determiner or after 'a'
    near_noun = next_word in noun_dtmn_list or prev_word == 'a'
    if near_noun and not hasVerbAffixes and (
            _has_reduplication(word, 0, 3) or _has_reduplication(word, 0, 2)):
        return True

    # 'na' + reduplicated 3 letters (e.g. 'nalaklaka', 'napinpintas')
    # or 4 letters (e.g. 'nasingsingpet')
    if word.startswith("na") and (
            _has_reduplication(word, 2, 3) or _has_reduplication(word, 2, 4)):
        return True

    return False
# end of function

### Adverb Checker

In [226]:
def isAdv(word, prev_word, next_word, next2_word, hasVerbAffixes):
    """
    Decide whether ``word`` in an Ilokano sentence should be tagged as an adverb.

    Personal pronouns are never adverbs. Any other word is an adverb as soon
    as one of the rules below matches. This function does not check by itself
    that the word is neither a verb nor a noun; tag() only calls it after
    isVerb() and isNoun() have rejected the word. ``next2_word`` and
    ``hasVerbAffixes`` are accepted but not used. Returns True or False.
    """
    if word in PER_PRONOUN:
        return False

    # Starts with 'idi', or starts with 'di' and does not follow 'nga'.
    # NOTE(review): the grouping mirrors how Python parsed the original
    # un-parenthesised "A or B and C"; confirm whether the 'nga' restriction
    # was meant to cover the 'idi' case as well.
    if word.startswith('idi') or (word.startswith('di') and prev_word != 'nga'):
        return True

    # Listed adverb of time
    if word in adv_time_list:
        return True

    # Directly follows an adverb determiner. The caller has already ruled
    # out verb and noun for this word.
    if prev_word in adv_dtmn_list:
        return True

    # 'na...' word followed by 'nga' or 'a' describes an adjective,
    # e.g. 'napartak nga iyaadu' (Tagalog gloss: 'mabilis na pagdami'),
    #      'napigsa a tudo' (Tagalog gloss: 'malakas na ulan')
    if word.startswith('na') and next_word in ('nga', 'a'):
        return True

    # 'na...' word that is not followed by a noun determiner,
    # e.g. 'napartak a mapukpukaw' (Tagalog gloss: 'mabilis na naglalaho')
    if word.startswith('na') and next_word not in noun_dtmn_list:
        return True

    # 'awan' when the next word is neither a noun determiner nor a pronoun
    if (word == "awan"
            and next_word not in noun_dtmn_list
            and next_word not in PER_PRONOUN):
        return True

    return False

### Preposition Checker

In [227]:
def isPrepo(word, prev_word):
    """
    Decide whether ``word`` should be tagged as a preposition.

    Returns True only when ``word`` is listed in PREPO_SET and the word before
    it (``prev_word``) is a preposition determiner; otherwise False.
    """
    # prev_word must be used exactly as passed in: resetting it here would
    # make the determiner test fail for every word.
    return prev_word in prepo_dtmn_list and word in PREPO_SET
# end of function

### Conjunction Checker

In [228]:
def isConj(word):
    """
    Return True when ``word`` is listed in CONJ_SET (i.e. it is a
    conjunction), otherwise False.
    """
    return word in CONJ_SET
# end of function

## Inserting the tagged words in the Dictionary

In [229]:
def tag(sentence_list):
    """
    Tag every word of every tokenized sentence with a part of speech.

    Each word is offered to the checkers in a fixed order and receives the
    tag of the first one that accepts it:
    'SW' (the sentence has exactly one word), 'DT' determiner,
    'CC' conjunction, 'VB' verb, 'NN' noun, 'JJ' adjective, 'RB' adverb,
    'PR' preposition, otherwise 'UNK'.

    Parameters
    ----------
    sentence_list : iterable of list of str
        One token list per sentence.

    Returns
    -------
    None
        The per-sentence verb, noun and adjective lists are written to the
        'Verb', 'Noun' and 'Adjective' columns of the module-level
        ``dict_sm_il``. The other categories are classified but not stored.
    """
    verb_sen_list = []
    noun_sen_list = []
    adj_sen_list = []

    for sentence in sentence_list:
        sen_len = len(sentence)
        words_by_tag = {}  # POS tag -> words of this sentence with that tag

        # Neighbours are looked up by position. Looking them up with
        # sentence.index(word) returns the FIRST occurrence and gives the
        # wrong context for repeated words such as 'ti' or 'ket'.
        for position, word in enumerate(sentence):
            prev_word = sentence[position - 1] if position >= 1 else None
            prev2_word = sentence[position - 2] if position >= 2 else None
            next_word = sentence[position + 1] if position + 1 < sen_len else ""
            next2_word = sentence[position + 2] if position + 2 < sen_len else ""

            # The word is untagged at this point, so the affix check always runs
            hasVerbAffixes = check_verb_affixes(word, prev_word, False, False)

            if sen_len == 1:
                # A sentence that is only one word long
                pos = 'SW'
            elif isDtmn(word):
                pos = 'DT'
            elif isConj(word):
                pos = 'CC'
            elif isVerb(word, prev_word, prev2_word, next_word, next2_word, hasVerbAffixes):
                pos = 'VB'
            elif isNoun(word, prev_word, prev2_word, next_word, next2_word):
                pos = 'NN'
            elif isAdj(word, prev_word, prev2_word, next_word, hasVerbAffixes):
                pos = 'JJ'
            elif isAdv(word, prev_word, next_word, next2_word, hasVerbAffixes):
                pos = 'RB'
            elif isPrepo(word, prev_word):
                pos = 'PR'
            else:
                # No checker accepted the word
                pos = 'UNK'

            words_by_tag.setdefault(pos, []).append(word)

        verb_sen_list.append(words_by_tag.get('VB', []))
        noun_sen_list.append(words_by_tag.get('NN', []))
        adj_sen_list.append(words_by_tag.get('JJ', []))

    # Only these three columns are exported; columns for the remaining tags
    # ('SW', 'DT', 'CC', 'RB', 'PR', 'UNK') are intentionally not written.
    dict_sm_il['Verb'] = verb_sen_list
    dict_sm_il['Noun'] = noun_sen_list
    dict_sm_il['Adjective'] = adj_sen_list

# Tag every tokenized sentence. tag() returns nothing; it writes the
# 'Verb', 'Noun' and 'Adjective' columns into dict_sm_il as a side effect.
tag(dict_sm_il['Tokenized'])

# Preview the first 30 tagged rows
dict_sm_il.head(30)

Unnamed: 0,Sentence,Tokenized,Verb,Noun,Adjective
0,GENESIS,[genesis],[],[],[]
1,Ti Aramid ti Dios a Namarsua.,"[ti, aramid, ti, dios, a, namarsua]",[aramid],[],[]
2,Idi punganay ti Dios pinarsuana dagiti langlan...,"[idi, punganay, ti, dios, pinarsuana, dagiti, ...",[pinarsuana],[],[]
3,Ket ti daga adda idi a gosogoso ken awan nagya...,"[ket, ti, daga, adda, idi, a, gosogoso, ken, a...",[],[],[]
4,"Ket ti Dios kinunana, Adda coma lawag: ket nag...","[ket, ti, dios, kinunana, adda, coma, lawag, k...","[kinunana, adda, nagadda]",[],[]
5,Ket ti Dios nakitana a ti lawag naimbag: ket i...,"[ket, ti, dios, nakitana, a, ti, lawag, naimba...","[nakitana, inlasin]",[],[]
6,"Ket ti Dios ninaganna ti lawag aldaw, ket dagi...","[ket, ti, dios, ninaganna, ti, lawag, aldaw, k...","[ninaganna, ninagananna, naadda, malem]",[],[]
7,"Ket ti Dios kinunana, adda coma maysa a tangat...","[ket, ti, dios, kinunana, adda, coma, maysa, a...","[kinunana, adda]",[],[]
8,"Ket ti Dios inaramidna ti tangatang, ket insin...","[ket, ti, dios, inaramidna, ti, tangatang, ket...","[inaramidna, insinana, naaramid a casta]",[],[]
9,"Ket ti Dios ninaganna, ti tangatang, Langit. K...","[ket, ti, dios, ninaganna, ti, tangatang, lang...","[ninaganna, naadda, malem, aldaw]",[],[maicadua]


## Tester

In [230]:
#temp_sen = dict_sm_il['Tokenized'][1]
#temp_pos = dict_sm_il['Verb'][1]
#temp_unkn = dict_sm_il['POS'][1]

#print(temp_sen)
#print(temp_pos)
#print(temp_unkn)


In [231]:
# Scratch cell: tries out merging the three words 'naaramid', 'a', 'casta'
# into one token, and the "first two letters repeated" slice comparison.
temp_sen = ['naaramid', 'a', 'casta']

# NOTE: temp_sen is modified inside the loop while it is being iterated.
# After the merge on the first pass it holds a single element, so the loop
# body runs only once for this input.
for word in temp_sen:
    
    try:
        # index() returns the position of the first occurrence of `word`
        next_word = temp_sen[temp_sen.index(word) + 1]
    except (ValueError, IndexError):
        next_word = None
    """
    gets the next word in the sentence
    """
    
    try:
        # the word two positions after `word`
        next2_word = temp_sen[temp_sen.index(word) + 2]
    except (ValueError, IndexError):
        next2_word = None
    """
    gets the next word in the sentence
    """
    
    # Replace the three separate words by the single phrase token.
    # remove() deletes the first element equal to its argument.
    if word == 'naaramid' and next_word == 'a' and next2_word == 'casta':
        temp_word = word + " " + next_word + " " + next2_word
        temp_sen[temp_sen.index(word)] = temp_word
        temp_sen.remove(next_word)
        temp_sen.remove(next2_word)

    # Independent of the loop variable: slices used by the reduplication rules
    word1 = "dadackel"
    first_second = word1[0:2]
    third_fourth = word1[2:4]

# Both slices print 'da' for "dadackel"
print(first_second)
print(third_fourth)

# temp_word is only defined when the merge condition above matched
print(temp_word)
print(temp_sen)


da
da
naaramid a casta
['naaramid a casta']


## Exporting the Dictionary to a JSON File

In [232]:
import json

# One plain dict per DataFrame row, keyed by column name
dictionary = dict_sm_il.to_dict('records')

try:
    with open("src/json data/il_pos.json", "w") as outfile:
        json.dump(dictionary, outfile)
except (OSError, TypeError, ValueError) as save_error:
    # OSError: the file could not be opened or written.
    # TypeError / ValueError: json.dump could not serialise the records.
    # The reason is reported instead of being hidden behind a bare `except`,
    # which would also swallow KeyboardInterrupt.
    print("Error in saving the json file:", save_error)
else:
    print("successfully saved the json file")

successfully saved the json file
