In [39]:
import pandas as pd
import conllu as cl
from os.path import join
import math

In [40]:
def load(language):
    filePath = ".\Datasets\\" + language + "\\"
    
    with open(join(filePath, "corpus.conllu"), 'r', encoding='utf-8') as file:
        data = file.read()
    train_sentences = cl.parse(data)

    with open(join(filePath, "testCorpus.conllu"), 'r', encoding='utf-8') as file:
        data = file.read()
    test_sentences = cl.parse(data)

    with open(join(filePath, "val.conllu"), 'r', encoding='utf-8') as file:
        data = file.read()
    val_sentences = cl.parse(data)

    return train_sentences, test_sentences, val_sentences

In [41]:
# Fit the HMM model with the train sentences, it generates 2 outputps containing the probabilities that will be used 
# in the decoding phase.
def HMMFit(train_sentences):
    # dictionary of dictionaries
    countTag = {}   # Count(tag(i))
    transitionCount = {}    # Count(tag(i-1), tag(i))
    wordTagCount = {}   # Count(tag(i), word(i))
    tagDistribution = {}    # Count(tag(i-1), tag(i)) / Count(tag(i-1))
    wordDistribution = {}   # Count(tag(i), word(i))

    # complete creation of counTag dictorary
    # partial initialization of transitionCount dictionary
    # partial creation of wordTagCount dictionary
    for sentence in train_sentences:
        for word in sentence:
            if word['lemma'] in countTag:
                countTag[word['lemma']] += 1
            else:
                countTag[word['lemma']] = 1
                transitionCount[word['lemma']] = {}
            
            if word['form'] not in wordTagCount:
                wordTagCount[word['form']] = {}
            if word['lemma'] in wordTagCount[word['form']]:
                wordTagCount[word['form']][word['lemma']] += 1
            else:
                wordTagCount[word['form']][word['lemma']] = 1
    
    # complete inizialization of transitionCount dictionary
    for key in transitionCount.keys():
        for tag in countTag.keys():
            transitionCount[key][tag] = 0

    # complete creation of wordTagCount dictionary
    for word in wordTagCount.keys():
        for tag in countTag.keys():
            if tag not in wordTagCount[word]:
                wordTagCount[word][tag] = 0

    # complete creation of transitionCount dictionary
    # previousTag --> Markov's hypothesis of first grade
    for tag in transitionCount.keys():
        for sentence in train_sentences:
            previousTag = None
            for word in sentence:
                if previousTag == None:
                    previousTag = word['lemma']
                else:
                    if word['lemma'] == tag:
                        if previousTag in transitionCount[tag]:
                            transitionCount[tag][previousTag] +=1
                    previousTag = word['lemma']
    
    # complete creation of tagDistribution dictionary
    # we use the log-probability for better accuracy (since the probability are very small)
    # if the probability is 0, then the log-probability will be log(10**-10) 
    epsilon = 10**-10
    for eventTag in countTag.keys():
        tagDistribution[eventTag] = {}
        for condTag in countTag.keys():
            if transitionCount[eventTag][condTag]/countTag[condTag] == 0:
                tagDistribution[eventTag][condTag] = math.log(epsilon)
            else:
                tagDistribution[eventTag][condTag] = math.log(transitionCount[eventTag][condTag]/countTag[condTag])
            
    # complete creation of wordDistribution dictionary
    # we use the log-probability for better accuracy (since the probability are very small)
    # if the probability is 0, then the log-probability will be log(10**-10)
    for word in wordTagCount.keys():
        wordDistribution[word] = {}
        for tag in countTag.keys():
            if wordTagCount[word][tag]/countTag[tag] == 0:
                wordDistribution[word][tag] = math.log(epsilon)
            else:
                wordDistribution[word][tag] = math.log(wordTagCount[word][tag]/countTag[tag])
        
    return tagDistribution, wordDistribution

In [87]:
def viterbiAlgorithm(sentence, tagDistribution, wordDistribution, b_value):
    viterbi = {}
    backpointer = {}

    # todo: add the descrpt.
    for word in sentence:
        viterbi[word['form']] = {}
        backpointer[word['form']] = {}

    # initialization step
    for tag in tagDistribution.keys():
        viterbi[(sentence[0])['form']][tag] = wordDistribution[(sentence[0])['form']][tag] * b_value
        backpointer[(sentence[0])['form']][tag] = 0

    # recursion step
    print(len(sentence))
    print(sentence)
    for i in range(1, len(sentence)):
        print((sentence[i])['form'])
        for tag in tagDistribution.keys():
            viterbi[(sentence[i])['form']][tag] = max(viterbi[(sentence[i-1])['form']][tagLoop] + wordDistribution[(sentence[i])['form']][tagLoop] + tagDistribution[tag][tagLoop] for tagLoop in tagDistribution.keys())
            backpointer[(sentence[i])['form']][tag] = max(viterbi[(sentence[i-1])['form']][tagLoop] + tagDistribution[tag][tagLoop] for tagLoop in tagDistribution.keys())
                
    # termitation step

    return viterbi


In [43]:
def HMMPredict(test_senteces, tagDistribution, wordDistribution):
    b_value = 1
    solution = {}
    backpointer = {}
    
    for sentence in test_senteces:
        solution = viterbiAlgorithm(sentence, tagDistribution, wordDistribution, b_value)
        # recontruct and add to the solution...

    return solution

In [44]:
it_train_sentences, it_test_sentences, it_val_sentences = load("it")
#en_train_sentences, en_test_sentences, en_val_sentences = load("en")
#es_train_sentences, es_test_sentences, es_val_sentences = load("es")

In [None]:
tagDistribution, wordDistribution = HMMFit(it_train_sentences)

In [88]:
HMMPredict(it_test_sentences, tagDistribution, wordDistribution)

3
TokenList<Paolo, ama, Francesca>
ama
-3.58351893845611
-2.8903717578961645
-23.7189981105004
-3.58351893845611
Francesca
-7.16703787691222
-6.473890696352275
-5.375278407684165
-7.16703787691222


{'Paolo': {'N': -1.791759469228055,
  'V': -23.025850929940457,
  'A': -23.025850929940457,
  'AGG': -23.025850929940457},
 'ama': {'N': -5.375278407684165,
  'V': -4.68213122712422,
  'A': -24.412145291060344,
  'AGG': -5.375278407684165},
 'Francesca': {'N': -8.958797346140274,
  'V': -8.265650165580329,
  'A': -28.40112933762462,
  'AGG': -8.958797346140274}}