# Imports

In [9]:
import os

from hmmlearn import hmm
from sklearn import preprocessing
import numpy as np
import pandas as pd
from utils.dataset import FileVocabCreator, POSFolderDataSet

In [2]:
# Fixed seed so stochastic steps are repeatable (passed as random_state to the HMM).
RANDOM_STATE = 99

# Location of the serialized word vocabulary.
VOCAB_PATH = os.path.join('data', 'vocab', 'words.json')

# Part-of-speech tag inventory, listed alphabetically.
STATES = [
    'A', 'Adv', 'Cj', 'Interj', 'N', 'Num',
    'Other', 'Pp', 'Pron', 'Punct', 'V',
]

In [3]:
# Load the word vocabulary from disk and show how many entries it holds.
# The encoding is pinned explicitly: the text handled in this notebook is
# non-ASCII (Georgian), so relying on the platform default encoding is fragile.
# NOTE(review): assumes the vocabulary file is UTF-8 encoded -- confirm.
with open(VOCAB_PATH, 'r', encoding='utf-8') as f:
    vocab = FileVocabCreator(f).make()
len(vocab)

1836995

In [4]:
# Root of the dataset tree; each split lives in its own sub-folder.
DATA_FOLDER = 'data'
TEST_FOLDER, TRAIN_FOLDER = (
    os.path.join(DATA_FOLDER, split_name) for split_name in ('test', 'train')
)

In [6]:
# Wrap each split folder in a POSFolderDataSet (test first, then train),
# passing the split name as the second argument.
test_data, train_data = (
    POSFolderDataSet(folder, split_name)
    for folder, split_name in ((TEST_FOLDER, 'test'), (TRAIN_FOLDER, 'train'))
)

# Hidden Markov Models (HMM)

In [11]:
from utils.utils import get_emissions, get_initial_prob, get_transitions
from utils.dataset import UNK_TOKEN

In [10]:
# Single synthetic row used when computing unknown-word probabilities:
# it pairs every tag in STATES with the UNK token in both word columns.
unk_df = pd.DataFrame({
    **{column: [[UNK_TOKEN] * len(STATES)] for column in ('init_words', 'mod_words')},
    'pos_tags': [STATES],
})

In [33]:
def compute_df(df):
    """Derive the three HMM parameter tables from a tagged data frame.

    Emissions are computed from ``df`` with the module-level ``unk_df`` row
    appended (columns ``pos_tags`` / ``mod_words``); initial probabilities and
    transitions are computed from ``df`` alone (column ``pos_tags``).

    Args:
        df: data frame with at least ``pos_tags`` and ``mod_words`` columns.

    Returns:
        A tuple ``(emissions, init_probs, transitions)`` as produced by
        ``get_emissions``, ``get_initial_prob`` and ``get_transitions``.
    """
    df_with_unk = pd.concat([df, unk_df])
    return (
        get_emissions(df_with_unk, 'pos_tags', 'mod_words'),
        get_initial_prob(df, 'pos_tags'),
        get_transitions(df, 'pos_tags'),
    )

here
here
here


['A', 'Adv', 'Cj', 'Interj', 'N', 'Num', 'Other', 'Pp', 'Pron', 'Punct', 'V']

In [35]:
# Map every POS tag to an integer id and display the resulting codes.
# Fit on the STATES constant: the lowercase `states` used here before is not
# assigned in any visible cell, so the cell failed on a fresh kernel.
le = preprocessing.LabelEncoder()
le.fit_transform(STATES)

array([ 0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10])

In [36]:
# Encode the dict-based HMM parameters as dense arrays whose rows are indexed
# by the LabelEncoder's tag ids and whose emission columns are indexed by the
# vocabulary's word ids.
# NOTE(review): `emissions`, `init_probs` and `transitions` are not assigned in
# any visible cell -- presumably they come from compute_df(...) on the training
# frame; confirm and restore that call so Restart & Run All works.
n_tags = len(le.classes_)

# Build the tag -> row index mapping once instead of calling le.transform
# for every single lookup inside the loops below.
tag_index = {tag: idx for idx, tag in enumerate(le.classes_)}

emissions_encoded = np.zeros((n_tags, len(vocab)))
for pos, values in emissions.items():
    pos_idx = tag_index[pos]
    for word, prob in values.items():
        emissions_encoded[pos_idx, vocab[word]] = prob

init_probs_encoded = np.zeros(n_tags)
for pos, prob in init_probs.items():
    init_probs_encoded[tag_index[pos]] = prob

transitions_encoded = np.zeros((n_tags, n_tags))
for pos_from, values in transitions.items():
    pos_from_idx = tag_index[pos_from]
    for pos_to, prob in values.items():
        transitions_encoded[pos_from_idx, tag_index[pos_to]] = prob

In [37]:
# Sanity check: show the encoded start-probability vector (one entry per tag id).
print(init_probs_encoded)

[0.11268687 0.15643344 0.0600044  0.00717793 0.32254277 0.04488455
 0.00333297 0.00227498 0.16402136 0.05698543 0.0696553 ]


In [38]:
# Assemble the HMM by assigning the counted parameters directly; fit() is not called.
# The number of hidden states is taken from the encoder rather than from
# len(init_probs): all three parameter arrays are allocated with
# len(le.classes_) rows, so a tag absent from init_probs would otherwise make
# n_components disagree with the array shapes.
gen_model = hmm.CategoricalHMM(n_components=len(le.classes_), random_state=RANDOM_STATE)
gen_model.startprob_ = init_probs_encoded
gen_model.emissionprob_ = emissions_encoded
gen_model.transmat_ = transitions_encoded

In [76]:
# Sample sentence for a quick smoke test, tokenised on whitespace
# (the '?' is separated by spaces so it becomes its own token).
sent = 'ეს არის სამართალი ? '.split()

In [77]:
# Tag the sample sentence: look up each word's vocabulary id and decode.
# hmmlearn expects one observation per row, i.e. shape (n_samples, 1);
# the previous (1, n_samples) layout is a deprecated input shape.
pred = gen_model.predict(np.array([vocab[word] for word in sent]).reshape(-1, 1))
# Map all predicted tag ids back to labels in a single call.
print(le.inverse_transform(pred).tolist())

['Pron', 'V', 'N', 'Punct']


In [90]:
from sklearn.metrics import classification_report

# Evaluate the HMM tagger sentence by sentence and report per-tag metrics.
# NOTE(review): neither `POSDataSet` nor `test_df` is defined or imported in any
# visible cell, and this rebinds the `test_data` created earlier from
# TEST_FOLDER -- confirm which dataset is meant to be evaluated.
test_data = POSDataSet(test_df)
y_true = []
y_pred = []
for x, y in test_data:
    # hmmlearn expects one observation per row: shape (n_samples, 1).
    observations = np.array([vocab[word] for word in x]).reshape(-1, 1)
    # Decode the whole sentence, then map every tag id back to its label at once.
    y_pred.extend(le.inverse_transform(gen_model.predict(observations)))
    y_true.extend(y)

print(classification_report(y_true, y_pred))

              precision    recall  f1-score   support

           A       0.78      0.69      0.73   2355454
         Adv       0.88      0.74      0.80   1027093
          Cj       0.99      0.67      0.80   1020428
      Interj       0.97      0.62      0.76      2430
           N       0.98      0.68      0.81   7021435
         Num       0.99      0.72      0.84    369660
       Other       0.90      0.20      0.32     12784
          Pp       0.98      0.52      0.68    308022
        Pron       0.97      0.76      0.85   1069004
       Punct       1.00      0.62      0.77   3132654
           V       0.29      0.99      0.45   1925774

    accuracy                           0.71  18244738
   macro avg       0.89      0.66      0.71  18244738
weighted avg       0.88      0.71      0.75  18244738



In [53]:
# Draw 10 synthetic observations from the model together with the hidden
# state sequence that produced them.
words, pos_tags = gen_model.sample(10)

In [54]:
# Show the sampled words and their generating tags.
# Fetch the index -> token table once: the previous version called
# vocab.get_itos() again for every sampled word.
# NOTE(review): presumably get_itos() rebuilds the full table on each call,
# which would explain the interrupted run -- confirm against the vocab class.
itos = vocab.get_itos()
# Flatten first so each element is a scalar id rather than a 1-element row.
print([itos[int(word_id)] for word_id in np.ravel(words)])
# Map all sampled tag ids back to labels in a single call.
print(le.inverse_transform(pos_tags).tolist())

KeyboardInterrupt: 

# Conditional Random Fields (CRFs)