In [1]:
# Move to the parent directory so the relative paths used below resolve from there.
# NOTE: this cell is not idempotent; re-running it moves up one more level each time.
%cd ..

/home/rml/dev/nlp


In [2]:
import numpy as np

In [3]:
# Load the emission and transition probability tables of the HMM model.
# `.item()` unwraps the 0-d object array returned by np.load into the stored
# Python object (used as a dict in the cells below).
# NOTE(review): allow_pickle=True unpickles arbitrary objects; only load .npy
# files that come from a trusted source.
emissionProbDict = np.load('./emissionHMM.npy', allow_pickle=True).item()
transitionProbDict = np.load('./transitionHMM.npy', allow_pickle=True).item()

In [4]:
# Distinct tags in sorted order: each emission key is split on '|' and the
# second field is taken as the tag.
stateSet = sorted({emission_key.split('|')[1] for emission_key in emissionProbDict})

In [5]:
# Map every tag to its position in the sorted tag list.
tagStateDict = {tag: row for row, tag in enumerate(stateSet)}
tagStateDict

{'ADJ': 0,
 'ADP': 1,
 'ADV': 2,
 'AUX': 3,
 'CCONJ': 4,
 'DET': 5,
 'INTJ': 6,
 'NOUN': 7,
 'NUM': 8,
 'PART': 9,
 'PRON': 10,
 'PROPN': 11,
 'PUNCT': 12,
 'SCONJ': 13,
 'SYM': 14,
 'VERB': 15,
 '_': 16}

In [6]:
from conllu import parse_incr

In [7]:
# Estimate the initial-state distribution: for every sentence in the corpus,
# count the UPOS tag of its first token, then divide by the number of sentences.
wordList = []  # not used in this cell; kept in case other cells rely on it
count = 0  # number of sentences read
initTagStateProb = {}

# The context manager guarantees the corpus file is closed, even if parsing fails
# (the previous version opened the file and never closed it).
with open("./resources/es_ancora-ud-dev.conllu", "r", encoding="utf8") as data_file:
    for tokenlist in parse_incr(data_file):
        count += 1
        tag = tokenlist[0]['upos']
        initTagStateProb[tag] = initTagStateProb.get(tag, 0) + 1

# Turn raw counts into relative frequencies (no-op when the corpus is empty).
for key in initTagStateProb:
    initTagStateProb[key] /= count

initTagStateProb

{'DET': 0.36275695284159615,
 'PROPN': 0.1124546553808948,
 'ADP': 0.15538089480048367,
 'PRON': 0.06348246674727932,
 'SCONJ': 0.02418379685610641,
 'ADV': 0.056831922611850064,
 'PUNCT': 0.08222490931076179,
 'VERB': 0.021160822249093107,
 'ADJ': 0.010882708585247884,
 'CCONJ': 0.032648125755743655,
 'NOUN': 0.02720677146311971,
 '_': 0.009068923821039904,
 'INTJ': 0.0006045949214026602,
 'AUX': 0.019347037484885126,
 'NUM': 0.01995163240628779,
 'PART': 0.0018137847642079807}

In [8]:
# Sanity check: the initial-state probabilities should add up to 1
# (up to floating-point rounding).
sum(initTagStateProb.values())

0.9999999999999999

In [9]:
import nltk
# Download the 'punkt' resource (presumably the tokenizer data that
# word_tokenize relies on; confirm against the NLTK documentation).
nltk.download('punkt')
from nltk import word_tokenize

[nltk_data] Downloading package punkt to /home/rml/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [14]:
def ViterbiMatrix(secuencia,
                  transitionProbDict=transitionProbDict,
                  emissionProbDict=emissionProbDict,
                  tagStageDict=tagStateDict,
                  initTagStateProb=initTagStateProb):
    """Build the Viterbi probability matrix for a sentence.

    Args:
        secuencia: Raw sentence; it is split into tokens with ``word_tokenize``
            and every token is lower-cased before lookup.
        transitionProbDict: Transition probabilities keyed as
            ``"tag|previous_tag"``.
        emissionProbDict: Emission probabilities keyed as ``"word|tag"``.
        tagStageDict: Mapping from tag to its row index in the matrix.
        initTagStateProb: Probability of each tag starting a sentence. Tags
            missing from this mapping are treated as having probability 0.

    Returns:
        ``numpy.ndarray`` of shape ``(len(tagStageDict), n_tokens)``. Cell
        ``[row, col]`` holds the best path probability that ends in the tag of
        that row at token ``col``; it stays 0 when the word/tag pair has no
        emission entry or no predecessor is reachable.
    """
    seq = word_tokenize(secuencia)
    viterbiProb = np.zeros((len(tagStageDict), len(seq)))

    # Nothing to fill for an empty sentence (avoids IndexError on seq[0]).
    if not seq:
        return viterbiProb

    # First column: initial-state probability times emission probability.
    first_word = seq[0].lower()
    for tag, tag_row in tagStageDict.items():
        word_tag = "{}|{}".format(first_word, tag)
        if word_tag in emissionProbDict:
            # .get: a tag may never start a sentence in the training corpus.
            viterbiProb[tag_row, 0] = initTagStateProb.get(tag, 0.0) * emissionProbDict[word_tag]

    # Remaining columns: extend the best path coming from the previous column.
    # The tagStageDict parameter is used throughout (not the module-level
    # mapping), so a caller-supplied mapping is honoured in every column.
    for col in range(1, len(seq)):
        word = seq[col].lower()
        for tag, tag_row in tagStageDict.items():
            word_tag = "{}|{}".format(word, tag)
            if word_tag not in emissionProbDict:
                continue
            emission = emissionProbDict[word_tag]  # invariant over previous tags

            possible_probs = []
            for prev_tag, prev_row in tagStageDict.items():
                tag_prevtag = "{}|{}".format(tag, prev_tag)
                prev_prob = viterbiProb[prev_row, col - 1]
                if tag_prevtag in transitionProbDict and prev_prob > 0:
                    possible_probs.append(prev_prob * transitionProbDict[tag_prevtag] * emission)

            # default=0.0: no reachable predecessor leaves the cell at zero
            # instead of raising ValueError on an empty list.
            viterbiProb[tag_row, col] = max(possible_probs, default=0.0)

    return viterbiProb
    

In [16]:
# Print small floats in fixed-point notation instead of scientific notation,
# so the tiny probabilities in the matrix are easier to read.
np.set_printoptions(suppress=True)

In [17]:
# Build and display the Viterbi matrix for a short example sentence.
matrix = ViterbiMatrix('el mundo es pequeño')
# NOTE(review): decoding the best tag per column needs an index -> tag lookup,
# e.g. [stateSet[i] for i in np.argmax(matrix, axis=0)]. tagStateDict maps
# tag -> index, so indexing it with argmax results would raise KeyError.
matrix

array([[0.        , 0.        , 0.        , 0.        ],
       [0.        , 0.        , 0.        , 0.        ],
       [0.        , 0.        , 0.        , 0.        ],
       [0.        , 0.        , 0.00000003, 0.        ],
       [0.        , 0.        , 0.        , 0.        ],
       [0.12461736, 0.        , 0.        , 0.        ],
       [0.        , 0.        , 0.        , 0.        ],
       [0.        , 0.00000402, 0.        , 0.        ],
       [0.        , 0.        , 0.        , 0.        ],
       [0.        , 0.        , 0.        , 0.        ],
       [0.        , 0.        , 0.        , 0.        ],
       [0.        , 0.00000365, 0.        , 0.        ],
       [0.        , 0.        , 0.        , 0.        ],
       [0.        , 0.        , 0.        , 0.        ],
       [0.        , 0.        , 0.        , 0.        ],
       [0.        , 0.        , 0.        , 0.        ],
       [0.        , 0.        , 0.        , 0.        ]])