In [20]:
from conllu import parse_incr
from hmm_viterbi import HiddenMarkovModel

import numpy as np

In [18]:
# Enable IPython's autoreload extension; mode 2 reloads imported modules before
# each cell executes, so edits to local modules are picked up without a kernel restart
%load_ext autoreload
%autoreload 2

In [3]:
# Path to the dataset file (a .conllu file), relative to the notebook's working directory.
# NOTE(review): judging by the file name this is the *test* split of UD English-EWT — confirm
# that this is the intended split for the rest of the notebook.
path = "../data/UD_English-EWT/en_ewt-ud-test.conllu"

In [4]:
# Load the dataset as a list of sentences, each a list of (word, POS tag) pairs

# One entry per sentence
ds_source = []

# conllu yields the file one parsed sentence at a time
with open(path, "r", encoding="utf-8") as conllu_file:
    for token_list in parse_incr(conllu_file):
        # Combined words such as "He's" are listed first with a tuple id, followed by
        # their component words ("He", "is") with int ids. Keeping only the int-id
        # entries therefore skips the combined entry and keeps the actual words.
        # Of all the annotations (lemma, morphological features, ...) only the
        # word form and its POS tag are needed here.
        word_tag_pairs = [
            (token['form'], token['upos'])
            for token in token_list
            if isinstance(token['id'], int)
        ]
        ds_source.append(word_tag_pairs)

print(f"Loaded {len(ds_source)} sentences")

Loaded 2077 sentences


In [5]:
# Inspect one sentence: a list of (word, POS tag) pairs
first = ds_source[0]
print(first)

[('What', 'PRON'), ('if', 'SCONJ'), ('Google', 'PROPN'), ('Morphed', 'VERB'), ('Into', 'ADP'), ('GoogleOS', 'PROPN'), ('?', 'PUNCT')]


In [6]:
# Spot-check a single (word, tag) pair from the first sentence
first[3]

('Morphed', 'VERB')

In [7]:
# Scratch check: converting a set to a list. Set iteration order is not guaranteed,
# so the order of the resulting list should not be relied upon.
list({'a', 2})

[2, 'a']

In [8]:
# Draw a 4x10 matrix of integers in [0, 10) as toy data.
# A seeded Generator is used so the cell produces the same matrix on every
# Restart & Run All (the unseeded global RNG gave different values each run).
rng = np.random.default_rng(0)
x = rng.integers(0, 10, (4, 10))

In [9]:
# Show the sampled matrix
print(x)

[[3 9 4 1 2 7 8 6 5 6]
 [4 3 7 8 6 4 0 6 4 8]
 [2 9 0 3 9 2 1 4 4 8]
 [9 4 6 4 0 6 5 8 9 5]]


In [10]:
# Divide every row by its own total so that each row sums to 1.
# keepdims=True keeps the totals as a column so the division broadcasts row-wise.
row_totals = x.sum(axis=1, keepdims=True)
print(x / row_totals)

[[0.05882353 0.17647059 0.07843137 0.01960784 0.03921569 0.1372549
  0.15686275 0.11764706 0.09803922 0.11764706]
 [0.08       0.06       0.14       0.16       0.12       0.08
  0.         0.12       0.08       0.16      ]
 [0.04761905 0.21428571 0.         0.07142857 0.21428571 0.04761905
  0.02380952 0.0952381  0.0952381  0.19047619]
 [0.16071429 0.07142857 0.10714286 0.07142857 0.         0.10714286
  0.08928571 0.14285714 0.16071429 0.08928571]]


In [22]:
# Build the HMM from the parsed (word, tag) sentences.
# NOTE(review): initial_probabilities is populated right after construction, so the
# constructor presumably estimates the model parameters — confirm in hmm_viterbi.
m = HiddenMarkovModel(ds_source)

In [24]:
# Inspect the model's initial probabilities.
# NOTE(review): presumably one entry per POS tag (probability of a sentence starting
# with that tag) — confirm the meaning and tag ordering in hmm_viterbi.
m.initial_probabilities

array([0.08858931, 0.15406837, 0.00722195, 0.05055368, 0.04333173,
       0.01685123, 0.07366394, 0.02359172, 0.01781416, 0.03322099,
       0.        , 0.07751565, 0.24699085, 0.06836784, 0.06788637,
       0.00240732, 0.02792489])

In [26]:
# Scratch cell: evaluates a string literal; it has no effect on the analysis
"Hi"

'Hi'