In [1]:
import pandas as pd
import conllu as cl
from os.path import join
import math

In [32]:
def reconstructSentences(sentences):
    modified_sentences = []
    for sentence in sentences:
        modified_sentence = []
        # create a start token
        new_token_start = {
            "id": len(sentence),
            "form": "<s>",
            "lemma": "START",
        }
        # create a end token
        new_token_end = {
            "id": len(sentence) + 1,
            "form": "</s>",
            "lemma": "END",
        }
        # append the start token
        modified_sentence.append(new_token_start)
        # append the rest of the sentence
        for word in sentence:
            modified_sentence.append(word)
        # append the end token
        modified_sentence.append(new_token_end)

        # append the new sentence to the list of sentences
        modified_sentences.append(modified_sentence)

    return modified_sentences

In [11]:
def load(language):
    filePath = ".\Datasets\\" + language + "\\"
    
    with open(join(filePath, "corpus.conllu"), 'r', encoding='utf-8') as file:
        data = file.read()
    train_sentences = reconstructSentences(cl.parse(data))

    with open(join(filePath, "testCorpus.conllu"), 'r', encoding='utf-8') as file:
        data = file.read()
    test_sentences = cl.parse(data)

    with open(join(filePath, "val.conllu"), 'r', encoding='utf-8') as file:
        data = file.read()
    val_sentences = cl.parse(data)

    return train_sentences, test_sentences, val_sentences

In [3]:
# Fit the HMM model with the train sentences, it generates 2 outputps containing the probabilities that will be used 
# in the decoding phase.
def HMMFit(train_sentences):
    # dictionary of dictionaries
    countTag = {}   # Count(tag(i))
    transitionCount = {}    # Count(tag(i-1), tag(i))
    wordTagCount = {}   # Count(tag(i), word(i))
    tagDistribution = {}    # Count(tag(i-1), tag(i)) / Count(tag(i-1))
    wordDistribution = {}   # Count(tag(i), word(i))

    # complete creation of counTag dictorary
    # partial initialization of transitionCount dictionary
    # partial creation of wordTagCount dictionary
    for sentence in train_sentences:
        for word in sentence:
            if word['lemma'] in countTag:
                countTag[word['lemma']] += 1
            else:
                countTag[word['lemma']] = 1
                transitionCount[word['lemma']] = {}
            
            if word['form'] not in wordTagCount:
                wordTagCount[word['form']] = {}
            if word['lemma'] in wordTagCount[word['form']]:
                wordTagCount[word['form']][word['lemma']] += 1
            else:
                wordTagCount[word['form']][word['lemma']] = 1
    
    # complete inizialization of transitionCount dictionary
    for key in transitionCount.keys():
        for tag in countTag.keys():
            transitionCount[key][tag] = 0

    # complete creation of wordTagCount dictionary
    for word in wordTagCount.keys():
        for tag in countTag.keys():
            if tag not in wordTagCount[word]:
                wordTagCount[word][tag] = 0

    # complete creation of transitionCount dictionary
    # previousTag --> Markov's hypothesis of first grade
    for tag in transitionCount.keys():
        for sentence in train_sentences:
            previousTag = None
            for word in sentence:
                if previousTag == None:
                    previousTag = word['lemma']
                else:
                    if word['lemma'] == tag:
                        if previousTag in transitionCount[tag]:
                            transitionCount[tag][previousTag] += 1
                    previousTag = word['lemma']
    
    # complete creation of tagDistribution dictionary
    # we use the log-probability for better accuracy (since the probability are very small)
    # if the probability is 0, then the log-probability will be log(10**-10) 
    epsilon = 10**-10
    for eventTag in countTag.keys():
        tagDistribution[eventTag] = {}
        for condTag in countTag.keys():
            if transitionCount[eventTag][condTag]/countTag[condTag] == 0:
                tagDistribution[eventTag][condTag] = math.log(epsilon)
            else:
                tagDistribution[eventTag][condTag] = math.log(transitionCount[eventTag][condTag]/countTag[condTag])
            
    # complete creation of wordDistribution dictionary
    # we use the log-probability for better accuracy (since the probability are very small)
    # if the probability is 0, then the log-probability will be log(10**-10)
    for word in wordTagCount.keys():
        wordDistribution[word] = {}
        for tag in countTag.keys():
            if wordTagCount[word][tag]/countTag[tag] == 0:
                wordDistribution[word][tag] = math.log(epsilon)
            else:
                wordDistribution[word][tag] = math.log(wordTagCount[word][tag]/countTag[tag])
        
    return tagDistribution, wordDistribution

In [82]:
def viterbiAlgorithm(sentence, tagDistribution, wordDistribution, b_value):
    viterbi = {}
    backpointer = {}

    # todo: add the descrpt.
    for word in sentence:
        viterbi[word['form']] = {}
        backpointer[word['form']] = {}
    viterbi["END"] = {}
    backpointer["END"] = {}

    # initialization step
    for tag in tagDistribution.keys():
        viterbi[(sentence[0])['form']][tag] = tagDistribution[tag]["START"] + b_value
        print("!!!!!!!initialization: " + str(tagDistribution[tag]["START"]))
        backpointer[(sentence[0])['form']][tag] = 0

    # recursion step
    print(len(sentence))
    print(sentence)
    for i in range(1, len(sentence)):
        print((sentence[i])['form'])
        for tag in tagDistribution.keys():
            if tag != "START" and tag != "END":
                viterbi[(sentence[i])['form']][tag] = max(viterbi[(sentence[i-1])['form']][tagLoop] + wordDistribution[(sentence[i])['form']][tag] + tagDistribution[tag][tagLoop] for tagLoop in tagDistribution.keys() if tagLoop not in ["START", "END"])
                #backpointer[(sentence[i])['form']][tag] = max(viterbi[(sentence[i-1])['form']][tagLoop] + tagDistribution[tag][tagLoop] for tagLoop in tagDistribution.keys())
                maxVit, maxTag = max((viterbi[(sentence[i-1])['form']][tagLoop] + tagDistribution[tag][tagLoop], tagLoop) for tagLoop in tagDistribution.keys() if tagLoop not in ["START", "END"])
                print("PAROLA CORRENTE: " + sentence[i]['form'] + " MAXVIT: " + str(maxVit) + " currentTAG: " + tag + " maxtag: " + maxTag)
                backpointer[(sentence[i])['form']][tag] = maxTag

    # termitation step
    viterbi["END"] = max(viterbi[(sentence[len(sentence)-1])['form']][tagLoop] + tagDistribution["END"][tagLoop] for tagLoop in tagDistribution.keys() if tagLoop not in ["START", "END"])
    #backpointer["END"][tag] = max(viterbi[(sentence[len(sentence)])['form']][tagLoop] + tagDistribution["END"][tagLoop] for tagLoop in tagDistribution.keys())
    maxVit, maxTag = max((viterbi[(sentence[len(sentence)-1])['form']][tagLoop] + tagDistribution["END"][tagLoop], tagLoop) for tagLoop in tagDistribution.keys() if tagLoop not in ["START", "END"])
    backpointer["END"] = maxTag

    return viterbi, backpointer


In [77]:
def HMMPredict(test_senteces, tagDistribution, wordDistribution):
    b_value = 0 # in log probabilities 0 is the neutral element
    solution = {}
    backpointer = {}
    
    for sentence in test_senteces:
        solution, backpointer = viterbiAlgorithm(sentence, tagDistribution, wordDistribution, b_value)
        # recontruct and add to the solution...

    return solution, backpointer

In [33]:
it_train_sentences, it_test_sentences, it_val_sentences = load("it")
#en_train_sentences, en_test_sentences, en_val_sentences = load("en")
#es_train_sentences, es_test_sentences, es_val_sentences = load("es")

[[{'id': 2, 'form': '<s>', 'lemma': 'START'},
  {'id': 0, 'form': 'Paolo', 'lemma': 'N'},
  {'id': 1, 'form': 'pesca', 'lemma': 'V'},
  {'id': 3, 'form': '</s>', 'lemma': 'END'}],
 [{'id': 4, 'form': '<s>', 'lemma': 'START'},
  {'id': 0, 'form': 'Giovanni', 'lemma': 'N'},
  {'id': 1, 'form': 'ama', 'lemma': 'V'},
  {'id': 2, 'form': 'i', 'lemma': 'A'},
  {'id': 3, 'form': 'cani', 'lemma': 'N'},
  {'id': 5, 'form': '</s>', 'lemma': 'END'}],
 [{'id': 2, 'form': '<s>', 'lemma': 'START'},
  {'id': 0, 'form': 'Francesca', 'lemma': 'N'},
  {'id': 1, 'form': 'ama', 'lemma': 'N'},
  {'id': 3, 'form': '</s>', 'lemma': 'END'}],
 [{'id': 3, 'form': '<s>', 'lemma': 'START'},
  {'id': 0, 'form': 'Una', 'lemma': 'A'},
  {'id': 1, 'form': 'pesca', 'lemma': 'N'},
  {'id': 2, 'form': 'Francesca', 'lemma': 'AGG'},
  {'id': 4, 'form': '</s>', 'lemma': 'END'}]]

In [55]:
tagDistribution, wordDistribution = HMMFit(it_train_sentences)
tagDistribution

{'START': {'START': -23.025850929940457,
  'N': -23.025850929940457,
  'V': -23.025850929940457,
  'END': -23.025850929940457,
  'A': -23.025850929940457,
  'AGG': -23.025850929940457},
 'N': {'START': -0.2876820724517809,
  'N': -1.791759469228055,
  'V': -23.025850929940457,
  'END': -23.025850929940457,
  'A': 0.0,
  'AGG': -23.025850929940457},
 'V': {'START': -23.025850929940457,
  'N': -1.0986122886681098,
  'V': -23.025850929940457,
  'END': -23.025850929940457,
  'A': -23.025850929940457,
  'AGG': -23.025850929940457},
 'END': {'START': -23.025850929940457,
  'N': -1.0986122886681098,
  'V': -0.6931471805599453,
  'END': -23.025850929940457,
  'A': -23.025850929940457,
  'AGG': 0.0},
 'A': {'START': -1.3862943611198906,
  'N': -23.025850929940457,
  'V': -0.6931471805599453,
  'END': -23.025850929940457,
  'A': -23.025850929940457,
  'AGG': -23.025850929940457},
 'AGG': {'START': -23.025850929940457,
  'N': -1.791759469228055,
  'V': -23.025850929940457,
  'END': -23.0258509299

In [83]:
HMMPredict(it_test_sentences, tagDistribution, wordDistribution)

!!!!!!!initialization: -23.025850929940457
!!!!!!!initialization: -0.2876820724517809
!!!!!!!initialization: -23.025850929940457
!!!!!!!initialization: -23.025850929940457
!!!!!!!initialization: -1.3862943611198906
!!!!!!!initialization: -23.025850929940457
3
TokenList<Paolo, ama, Francesca>
ama
PAROLA CORRENTE: ama MAXVIT: -1.3862943611198906 currentTAG: N maxtag: A
PAROLA CORRENTE: ama MAXVIT: -1.3862943611198908 currentTAG: V maxtag: N
PAROLA CORRENTE: ama MAXVIT: -23.313533002392237 currentTAG: A maxtag: N
PAROLA CORRENTE: ama MAXVIT: -2.0794415416798357 currentTAG: AGG maxtag: N
Francesca
PAROLA CORRENTE: Francesca MAXVIT: -4.969813299576 currentTAG: N maxtag: N
PAROLA CORRENTE: Francesca MAXVIT: -4.276666119016055 currentTAG: V maxtag: N
PAROLA CORRENTE: Francesca MAXVIT: -2.7725887222397816 currentTAG: A maxtag: V
PAROLA CORRENTE: Francesca MAXVIT: -4.969813299576 currentTAG: AGG maxtag: N


({'Paolo': {'START': -23.025850929940457,
   'N': -0.2876820724517809,
   'V': -23.025850929940457,
   'END': -23.025850929940457,
   'A': -1.3862943611198906,
   'AGG': -23.025850929940457},
  'ama': {'N': -3.1780538303479453,
   'V': -2.079441541679836,
   'A': -46.339383932332694,
   'AGG': -25.10529247162029},
  'Francesca': {'N': -6.761572768804054,
   'V': -27.302517048956513,
   'A': -25.79843965218024,
   'AGG': -4.969813299576},
  'END': -4.969813299576},
 {'Paolo': {'START': 0, 'N': 0, 'V': 0, 'END': 0, 'A': 0, 'AGG': 0},
  'ama': {'N': 'A', 'V': 'N', 'A': 'N', 'AGG': 'N'},
  'Francesca': {'N': 'N', 'V': 'N', 'A': 'V', 'AGG': 'N'},
  'END': 'AGG'})