In [196]:
import pandas as pd
import conllu as cl
from os.path import join
import math

In [197]:
def reconstructSentences(sentences):
    """Wrap every sentence with explicit start and end boundary tokens.

    Parameters
    ----------
    sentences : iterable of sequences of token mappings
        Each sentence must support ``len()`` and iteration.

    Returns
    -------
    list of list
        One plain list per input sentence: a ``<s>``/``START`` token, the
        original token objects (not copied), then a ``</s>``/``END`` token.
    """
    modified_sentences = []
    for sentence in sentences:
        # The start marker gets id 0 so that it precedes the first real token
        # instead of reusing the id of the last one (assumes the usual 1-based
        # token numbering of the input -- TODO confirm against the corpus).
        new_token_start = {
            "id": 0,
            "form": "<s>",
            "lemma": "START",
        }
        # The end marker takes the id right after the last real token.
        new_token_end = {
            "id": len(sentence) + 1,
            "form": "</s>",
            "lemma": "END",
        }
        modified_sentences.append([new_token_start, *sentence, new_token_end])

    return modified_sentences

In [198]:
def load(language):
    """Read the train, test and validation corpora of one language.

    The files ``corpus.conllu``, ``testCorpus.conllu`` and ``val.conllu`` are
    read from ``./Datasets/<language>/`` (relative to the working directory)
    and parsed with ``conllu.parse``.

    Parameters
    ----------
    language : str
        Name of the sub-folder of ``Datasets`` to read, e.g. ``"it"``.

    Returns
    -------
    tuple
        ``(train_sentences, test_sentences, val_sentences)``. Only the train
        sentences are passed through ``reconstructSentences`` (start/end
        tokens added); test and validation sentences are returned as parsed.
    """
    # os.path.join picks the right separator for the platform; the previous
    # hard-coded backslashes only worked on Windows and relied on the
    # invalid escape sequence "\D".
    filePath = join(".", "Datasets", language)

    def readCorpus(fileName):
        # Read one CoNLL-U file of the language folder and parse it.
        with open(join(filePath, fileName), 'r', encoding='utf-8') as file:
            return cl.parse(file.read())

    train_sentences = reconstructSentences(readCorpus("corpus.conllu"))
    test_sentences = readCorpus("testCorpus.conllu")
    val_sentences = readCorpus("val.conllu")

    return train_sentences, test_sentences, val_sentences

In [199]:
# Fit the HMM model on the training sentences. It returns two dictionaries of log-probabilities
# (tag transitions and word emissions) that are used in the decoding phase.
def HMMFit(train_sentences):
    """Estimate the HMM transition and emission log-probabilities.

    The tag of every token is read from its ``'lemma'`` field and the
    observed word from its ``'form'`` field.

    Parameters
    ----------
    train_sentences : iterable of iterables of token mappings
        Training sentences; tag transitions are counted inside each sentence
        only, never across sentence boundaries.

    Returns
    -------
    tagDistribution : dict of dict
        ``tagDistribution[tag][previousTag]`` is
        ``log(Count(previousTag, tag) / Count(previousTag))``.
    wordDistribution : dict of dict
        ``wordDistribution[form][tag]`` is
        ``log(Count(tag, form) / Count(tag))``.

    Both tables are dense over every tag seen in training; events that were
    never observed get ``log(10**-10)`` instead of ``log(0)``.
    """
    countTag = {}           # Count(tag(i))
    transitionCount = {}    # transitionCount[tag][previousTag] = Count(tag(i-1), tag(i))
    wordTagCount = {}       # wordTagCount[form][tag] = Count(tag(i), word(i))

    # A single pass over the corpus gathers all three count tables
    # (first-order Markov assumption: only the previous tag is tracked).
    for sentence in train_sentences:
        previousTag = None
        for word in sentence:
            tag = word['lemma']
            form = word['form']

            countTag[tag] = countTag.get(tag, 0) + 1

            formCounts = wordTagCount.setdefault(form, {})
            formCounts[tag] = formCounts.get(tag, 0) + 1

            # The first token of a sentence has no predecessor to pair with.
            if previousTag is not None:
                tagCounts = transitionCount.setdefault(tag, {})
                tagCounts[previousTag] = tagCounts.get(previousTag, 0) + 1
            previousTag = tag

    # Log-probabilities are used because the raw probabilities are very small;
    # a zero count is replaced by log(epsilon) since log(0) is undefined.
    epsilon = 10**-10
    logEpsilon = math.log(epsilon)

    def logProbability(count, total):
        # total is always >= 1 here: it is the count of a tag seen in training.
        if count == 0:
            return logEpsilon
        return math.log(count / total)

    # Iterating countTag for both levels keeps the tag order of the tables
    # equal to the order in which tags were first seen.
    tagDistribution = {
        eventTag: {
            condTag: logProbability(
                transitionCount.get(eventTag, {}).get(condTag, 0), countTag[condTag]
            )
            for condTag in countTag
        }
        for eventTag in countTag
    }

    wordDistribution = {
        form: {
            tag: logProbability(formCounts.get(tag, 0), countTag[tag])
            for tag in countTag
        }
        for form, formCounts in wordTagCount.items()
    }

    return tagDistribution, wordDistribution

In [212]:
def viterbiAlgorithm(sentence, tagDistribution, wordDistribution, b_value):
    """Return the most likely tag sequence of a sentence (Viterbi decoding).

    Parameters
    ----------
    sentence : sequence of token mappings
        Tokens to tag; only the ``'form'`` field is read. The sentence must
        not contain the artificial start/end tokens.
    tagDistribution : dict of dict
        ``tagDistribution[tag][previousTag]`` transition log-probabilities;
        must contain the ``"START"`` and ``"END"`` tags.
    wordDistribution : dict of dict
        ``wordDistribution[form][tag]`` emission log-probabilities.
    b_value : float
        Extra log-score added to every tag in the first column
        (0 leaves the scores unchanged).

    Returns
    -------
    list
        One tag per token; an empty list for an empty sentence. Score ties
        are resolved in favour of the tag that comes later in
        ``tagDistribution``.
    """
    if len(sentence) == 0:
        return []

    logEpsilon = math.log(10**-10)
    # Real tags only: START and END never label a word.
    tagList = [tag for tag in tagDistribution if tag != "START" and tag != "END"]
    tagIndices = range(len(tagList))
    columns = len(sentence)

    def emission(position, tag):
        # Forms never seen in training have no row in wordDistribution; give
        # them the same floor as any other unseen event so that the
        # transitions alone decide their tag instead of raising KeyError.
        return wordDistribution.get(sentence[position]['form'], {}).get(tag, logEpsilon)

    viterbi = [[logEpsilon] * columns for _ in tagIndices]
    backpointer = [[None] * columns for _ in tagIndices]

    # initialization step: transition from START plus the emission of the
    # first word (the emission was previously left out).
    for row, tag in enumerate(tagList):
        viterbi[row][0] = tagDistribution[tag]["START"] + emission(0, tag) + b_value

    # recursion step: the emission does not depend on the previous tag, so
    # the best predecessor is chosen on score + transition alone.
    for column in range(1, columns):
        for row, tag in enumerate(tagList):
            bestScore, bestPrevious = max(
                (viterbi[previous][column - 1] + tagDistribution[tag][tagList[previous]], previous)
                for previous in tagIndices
            )
            viterbi[row][column] = bestScore + emission(column, tag)
            backpointer[row][column] = bestPrevious

    # termination step: best last tag once the transition to END is added
    last = columns - 1
    _, bestLast = max(
        (viterbi[row][last] + tagDistribution["END"][tagList[row]], row)
        for row in tagIndices
    )

    # reconstructing the solution by following the backpointers right to left
    path = [None] * columns
    path[last] = bestLast
    for column in range(last - 1, -1, -1):
        path[column] = backpointer[path[column + 1]][column + 1]

    return [tagList[row] for row in path]

In [201]:
def evaluation(predict, test_sentences):
    """Compute the token-level accuracy of the predicted tags.

    Parameters
    ----------
    predict : sequence of sequences
        ``predict[i][j]`` is the tag predicted for token ``j`` of sentence ``i``.
    test_sentences : iterable of iterables of token mappings
        Reference sentences; the gold tag is read from the ``'lemma'`` field.

    Returns
    -------
    float
        Correct predictions divided by the number of tokens, or ``0.0`` when
        there is no token to score.
    """
    predictions = 0
    correctPredictions = 0

    for sentenceIndex, sentence in enumerate(test_sentences):
        for wordIndex, word in enumerate(sentence):
            # Indexing (rather than zip) keeps a too-short prediction an error
            # instead of silently skipping tokens.
            if word['lemma'] == predict[sentenceIndex][wordIndex]:
                correctPredictions += 1
            predictions += 1

    # Nothing to score: avoid dividing by zero.
    if predictions == 0:
        return 0.0

    return correctPredictions / predictions

In [202]:
def HMMPredict(test_senteces, tagDistribution, wordDistribution):
    """Decode every sentence with the Viterbi algorithm.

    Returns a list holding, for each input sentence and in the same order,
    the value returned by ``viterbiAlgorithm`` for that sentence.
    """
    # 0 is the neutral element for sums of log-probabilities
    neutral_b = 0
    return [
        viterbiAlgorithm(tokens, tagDistribution, wordDistribution, neutral_b)
        for tokens in test_senteces
    ]

In [203]:
# Load the train, test and validation sentences from the "it" dataset folder.
it_train_sentences, it_test_sentences, it_val_sentences = load("it")
# The "en" and "es" datasets are currently disabled.
#en_train_sentences, en_test_sentences, en_val_sentences = load("en")
#es_train_sentences, es_test_sentences, es_val_sentences = load("es")

In [204]:
# Estimate the transition and emission log-probabilities from the training sentences.
tagDistribution, wordDistribution = HMMFit(it_train_sentences)

In [213]:
# Tag every test sentence, then show the predicted tag sequences as the cell output.
predict = HMMPredict(it_test_sentences, tagDistribution, wordDistribution)
predict

[['0', 2, 1], ['0', 0, 1], ['0', 0, 1], ['0', 0, 1]]
[['0', 2, 1, 2], ['0', 0, 0, 2], ['0', 0, 1, 2], ['0', 0, 0, 2]]


[['N', 'V', 'N'], ['N', 'V', 'A', 'N']]

In [211]:
# Token-level accuracy of the predictions against the 'lemma' field of the test tokens.
evaluation(predict, it_test_sentences)

1.0