In [None]:
import numpy as np

def _safe_log(values):
    """Return the element-wise natural log of ``values`` as a float array.

    Zeros map to ``-inf`` without raising a divide-by-zero RuntimeWarning, so
    impossible transitions/emissions remain representable in log space.
    """
    with np.errstate(divide="ignore"):
        return np.log(np.asarray(values, dtype=float))


def viterbi(observations, state_graph):
    """Find the most probable hidden-state sequence for ``observations``.

    Scores are accumulated as sums of log-probabilities rather than products
    of probabilities. Multiplying many values below 1 underflows to 0.0 on
    long sequences, after which every state ties and the decoded path is
    meaningless; log space keeps the comparison exact enough to stay correct.

    Parameters
    ----------
    observations : sequence
        Observed symbols. Each one is used as an index/key into a state's
        ``b`` attribute.
    state_graph : sequence
        One entry per hidden state. Entry ``s`` must expose:

        * ``pi`` - initial probability of starting in state ``s``;
        * ``a``  - transition weights, indexed by destination state index;
        * ``b``  - emission weights, indexed by observation.

        Values are assumed to be non-negative (probabilities).

    Returns
    -------
    tuple
        ``(best_path, best_path_prob)`` where ``best_path`` is a list of plain
        ``int`` state indices (one per observation) and ``best_path_prob`` is
        the probability of that path as a ``float``. For very long sequences
        the probability itself may still underflow to ``0.0`` even though the
        path is decoded correctly. Empty ``observations`` yield ``([], 1.0)``
        (the empty path, whose probability is the empty product).

    Raises
    ------
    ValueError
        If ``state_graph`` contains no states.
    """
    n_states = len(state_graph)
    n_steps = len(observations)

    if n_states == 0:
        raise ValueError("state_graph must contain at least one state")
    if n_steps == 0:
        return [], 1.0

    states = range(n_states)

    def log_emission(obs):
        # Log emission weight of `obs` for every state, in state order.
        return _safe_log([state_graph[s].b[obs] for s in states])

    log_delta = np.empty((n_states, n_steps))
    backpointer = np.zeros((n_states, n_steps), dtype=int)

    # Initialization step
    log_pi = _safe_log([state_graph[s].pi for s in states])
    log_delta[:, 0] = log_pi + log_emission(observations[0])

    # Recursion step
    if n_steps > 1:
        # log_a[p, s] = log weight of moving from state p to state s.
        # Built only when a transition is actually needed, so a one-symbol
        # input never touches the `a` attribute.
        log_a = _safe_log([[state_graph[p].a[s] for s in states] for p in states])

        for t in range(1, n_steps):
            # scores[p, s]: best log-score of being in s at time t having come
            # from p. The emission term is the same for every predecessor, so
            # it is added after the max and does not affect the argmax.
            scores = log_delta[:, t - 1][:, np.newaxis] + log_a
            backpointer[:, t] = scores.argmax(axis=0)
            log_delta[:, t] = scores.max(axis=0) + log_emission(observations[t])

    # Termination step
    current = int(np.argmax(log_delta[:, n_steps - 1]))
    best_path_prob = float(np.exp(log_delta[current, n_steps - 1]))

    # Path backtracking (collected back-to-front, then reversed once)
    best_path = [current]
    for t in range(n_steps - 1, 0, -1):
        current = int(backpointer[current, t])
        best_path.append(current)
    best_path.reverse()

    return best_path, best_path_prob

In [2]:
import pandas as pd

# Read the NER corpus from the working directory, decoding it as Latin-1.
df = pd.read_csv(
    "NER.csv",
    encoding="latin1",
)
# Plain-text preview of the frame (head/tail rows plus its shape).
print(df)

          Sentence #           Word  POS Tag
0        Sentence: 1      Thousands  NNS   O
1                NaN             of   IN   O
2                NaN  demonstrators  NNS   O
3                NaN           have  VBP   O
4                NaN        marched  VBN   O
...              ...            ...  ...  ..
1048570          NaN           they  PRP   O
1048571          NaN      responded  VBD   O
1048572          NaN             to   TO   O
1048573          NaN            the   DT   O
1048574          NaN         attack   NN   O

[1048575 rows x 4 columns]


In [5]:
# Vocabulary: every distinct entry of the 'Word' column.
vocab = {word for word in df['Word']}

# Tag set: every distinct entry of the 'Tag' column.
tags = {tag for tag in df['Tag']}


In [6]:
# (number of distinct words, number of distinct tags)
(len(vocab), len(tags))

(35178, 17)