In [1]:
import pandas as pd
import conllu as cl
from os.path import join
import math

In [2]:
def load(language):
    """Load and parse the train, test and validation splits for one language.

    The three files ``train.conllu``, ``test.conllu`` and ``val.conllu`` are
    read (UTF-8) from ``./Datasets/<language>/`` relative to the current
    working directory, and each is parsed with ``cl.parse``.

    Parameters
    ----------
    language : str
        Name of the dataset sub-directory, e.g. ``"it"``.

    Returns
    -------
    tuple
        ``(train_sentences, test_sentences, val_sentences)`` -- note that the
        test split comes before the validation split.

    Raises
    ------
    OSError
        If one of the three split files cannot be opened.
    """
    # Build the directory with join() rather than hard-coded backslashes:
    # the old literal ".\Datasets\\" only worked on Windows and contained the
    # invalid escape sequence "\D". On Windows the resulting paths are the same.
    dataset_dir = join(".", "Datasets", language)

    def read_split(file_name):
        # Read one split in full and hand the text to the CoNLL-U parser.
        with open(join(dataset_dir, file_name), 'r', encoding='utf-8') as file:
            return cl.parse(file.read())

    train_sentences = read_split("train.conllu")
    test_sentences = read_split("test.conllu")
    val_sentences = read_split("val.conllu")

    return train_sentences, test_sentences, val_sentences

In [7]:
# Fit the HMM model with the train sentences; it generates 2 outputs containing the
# log-probabilities that will be used in the decoding phase.
def HMMFit(train_sentences, epsilon=10**-10):
    """Estimate first-order HMM log-probabilities from tagged sentences.

    Every token is indexed with ``token['form']`` (the word) and
    ``token['lemma']`` (used here as the tag).
    NOTE(review): presumably the dataset stores the tag in the column that the
    parser exposes as 'lemma' -- confirm against the data files.

    Parameters
    ----------
    train_sentences : iterable
        Sentences, each an iterable of tokens supporting ``['form']`` and
        ``['lemma']`` lookups.
    epsilon : float, optional
        Probability substituted for any pair that was never observed, so that
        its logarithm is finite. Defaults to ``10**-10``.

    Returns
    -------
    tuple of (dict, dict)
        ``tagDistribution[tag][previousTag]`` is
        ``log(Count(previousTag, tag) / Count(previousTag))`` and
        ``wordDistribution[word][tag]`` is
        ``log(Count(tag, word) / Count(tag))``; unseen pairs map to
        ``log(epsilon)``. Both tables have an entry for every tag seen in
        training. The first token of a sentence contributes no transition,
        so no start-of-sentence probability is estimated.

    Raises
    ------
    ValueError
        If ``epsilon`` is not positive (raised by ``math.log``).
    """
    countTag = {}          # Count(tag(i))
    transitionCount = {}   # transitionCount[tag][previousTag] = Count(tag(i-1), tag(i)), sparse
    wordTagCount = {}      # wordTagCount[word][tag] = Count(tag(i), word(i)), sparse

    # One pass over the corpus gathers all three tables. Scanning every
    # sentence once per tag (as was done before) yields the same counts but
    # costs (number of tags) x (corpus size).
    for sentence in train_sentences:
        previousTag = None  # first-order Markov assumption; reset at sentence start
        for word in sentence:
            tag = word['lemma']
            form = word['form']

            countTag[tag] = countTag.get(tag, 0) + 1

            tagsOfWord = wordTagCount.setdefault(form, {})
            tagsOfWord[tag] = tagsOfWord.get(tag, 0) + 1

            if previousTag is not None:
                predecessors = transitionCount.setdefault(tag, {})
                predecessors[previousTag] = predecessors.get(previousTag, 0) + 1
            previousTag = tag

    # Log-probabilities are used because the raw probabilities are very small.
    logEpsilon = math.log(epsilon)

    # tagDistribution[tag][previousTag]: transition log-probabilities.
    tagDistribution = {}
    for eventTag in countTag:
        predecessors = transitionCount.get(eventTag, {})
        row = {}
        for condTag, condTotal in countTag.items():
            pairCount = predecessors.get(condTag, 0)
            row[condTag] = math.log(pairCount / condTotal) if pairCount else logEpsilon
        tagDistribution[eventTag] = row

    # wordDistribution[word][tag]: emission log-probabilities.
    wordDistribution = {}
    for form, tagsOfWord in wordTagCount.items():
        row = {}
        for tag, tagTotal in countTag.items():
            pairCount = tagsOfWord.get(tag, 0)
            row[tag] = math.log(pairCount / tagTotal) if pairCount else logEpsilon
        wordDistribution[form] = row

    return tagDistribution, wordDistribution

In [4]:
# Load the three splits stored under the "it" dataset folder; load() returns
# them in (train, test, val) order.
it_train_sentences, it_test_sentences, it_val_sentences = load("it")
# The "en" and "es" datasets load the same way; currently disabled.
#en_train_sentences, en_test_sentences, en_val_sentences = load("en")
#es_train_sentences, es_test_sentences, es_val_sentences = load("es")

In [8]:
# Fit the HMM on the "it" training split and keep its two log-probability tables.
# NOTE(review): `wordDisribution` is misspelled (missing "t"); the name is left as
# written because cells outside this view may reference it.
tagDistribution, wordDisribution = HMMFit(it_train_sentences)