# Imports

In [24]:
import os

from hmmlearn import hmm
from sklearn import preprocessing
import numpy as np
import pandas as pd
from utils.dataset import FileVocabCreator, POSFolderDataSet, POSDataSet

In [2]:
# Seed handed to the HMM so its stochastic operations are repeatable.
RANDOM_STATE = 99
# Vocabulary file location, relative to the working directory.
VOCAB_PATH = os.path.join('data', 'vocab', 'words.json')

# Part-of-speech tag set; these labels are what the label encoder is fitted on.
STATES = [
    'A', 'Adv', 'Cj', 'Interj', 'N', 'Num',
    'Other', 'Pp', 'Pron', 'Punct', 'V',
]

In [3]:
# The vocabulary is later queried with non-ASCII (Georgian) words, so decode
# the file explicitly as UTF-8 instead of relying on the platform default.
with open(VOCAB_PATH, 'r', encoding='utf-8') as f:
    vocab = FileVocabCreator(f).make()
# Displayed as the cell output: number of entries in the vocabulary.
len(vocab)

1836995

In [4]:
# Root of the dataset tree; each split lives in a sub-folder named after it.
DATA_FOLDER = 'data'
TEST_FOLDER, TRAIN_FOLDER = (
    os.path.join(DATA_FOLDER, split) for split in ('test', 'train')
)

In [6]:
# Load both splits from their folders. The second argument repeats the split
# name; its exact meaning is defined by POSFolderDataSet (not visible here).
test_data = POSFolderDataSet(TEST_FOLDER, 'test')
train_data = POSFolderDataSet(TRAIN_FOLDER, 'train')

# Hidden Markov Models (HMM)

In [11]:
from utils.utils import get_emissions, get_initial_prob, get_transitions
from utils.dataset import UNK_TOKEN

In [10]:
# Synthetic single-row frame pairing UNK_TOKEN with every tag in STATES.
# compute_df appends it before estimating emissions -- presumably so each tag
# receives an emission entry for unknown words (confirm in get_emissions).
unk_df = pd.DataFrame(
    {
        'init_words': [[UNK_TOKEN for _ in STATES]],
        'mod_words': [[UNK_TOKEN for _ in STATES]],
        'pos_tags': [STATES],
    }
)

In [12]:
def compute_df(df):
    """Estimate the three HMM parameter tables from a tagged data frame.

    The frame is passed to the project helpers ``get_emissions``,
    ``get_initial_prob`` and ``get_transitions`` using the ``'pos_tags'``
    column (and ``'mod_words'`` for emissions). Only the emission estimate
    sees the extra ``unk_df`` row; initial and transition estimates use
    ``df`` unchanged.

    Args:
        df: data frame with at least ``'pos_tags'`` and ``'mod_words'``
            columns (schema is defined by the helpers -- TODO confirm).

    Returns:
        Tuple ``(emissions, init_probs, transitions)`` exactly as returned
        by the respective helpers.
    """
    df_with_unk = pd.concat([df, unk_df])
    return (
        get_emissions(df_with_unk, 'pos_tags', 'mod_words'),
        get_initial_prob(df, 'pos_tags'),
        get_transitions(df, 'pos_tags'),
    )

In [14]:
# Estimate the HMM parameters from the 'data' frame of the first train_data entry.
emissions, init_probs, transitions = compute_df(train_data[0]['data'])

In [16]:
# Map each tag in STATES to an integer id; these ids index the rows/columns of
# the HMM parameter arrays. The transformed ids are shown as the cell output.
le = preprocessing.LabelEncoder()
le.fit_transform(STATES)

array([ 0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10])

In [17]:
# Convert the dict-based estimates into dense arrays indexed by encoded tag id
# (and, for emissions, by vocabulary id along the second axis). Tags or words
# that do not occur in the estimates keep their zero initialisation.
n_states = len(le.classes_)
# Build the tag -> id lookup once instead of calling le.transform per entry.
tag_to_idx = {tag: idx for idx, tag in enumerate(le.classes_)}

emissions_encoded = np.zeros((n_states, len(vocab)))
for pos, values in emissions.items():
    pos_idx = tag_to_idx[pos]
    for word, prob in values.items():
        emissions_encoded[pos_idx, vocab[word]] = prob

init_probs_encoded = np.zeros(n_states)
for pos, prob in init_probs.items():
    init_probs_encoded[tag_to_idx[pos]] = prob

transitions_encoded = np.zeros((n_states, n_states))
for pos_from, values in transitions.items():
    pos_from_idx = tag_to_idx[pos_from]
    for pos_to, prob in values.items():
        transitions_encoded[pos_from_idx, tag_to_idx[pos_to]] = prob

In [18]:
# Show the encoded initial-state vector (one entry per encoded tag id).
print(init_probs_encoded)

[0.15037112 0.15138859 0.06172808 0.00165977 0.30162165 0.03884549
 0.00470618 0.0045456  0.20911582 0.02812602 0.04789167]


In [19]:
# Build the HMM directly from the pre-computed probabilities (no fitting).
# n_components comes from the label encoder rather than from init_probs so it
# always matches the shape of the encoded arrays, even when some tag has no
# entry in init_probs.
gen_model = hmm.CategoricalHMM(n_components=len(le.classes_), random_state=RANDOM_STATE)
gen_model.startprob_ = init_probs_encoded
gen_model.emissionprob_ = emissions_encoded
gen_model.transmat_ = transitions_encoded

In [20]:
# Whitespace-tokenised sample sentence used for a quick prediction check.
sent = 'ეს არის სამართალი ? '.split()

In [21]:
# hmmlearn expects one observation per row, i.e. shape (n_samples, 1); passing
# the whole sentence as a single row of shape (1, n_samples) is a deprecated
# input form.
sent_ids = np.array([vocab[word] for word in sent]).reshape(-1, 1)
pred = gen_model.predict(sent_ids)
# Decode all predicted state ids back to tag names in one call.
print(le.inverse_transform(pred).tolist())

['Pron', 'V', 'N', 'Punct']


In [25]:
from sklearn.metrics import classification_report

# Evaluate the tagger on the first test_data entry, one (words, tags) pair at a time.
test_df = POSDataSet(test_data[0]['data'])
y_true = []
pred_ids = []
for x, y in test_df:
    # One observation per row, shape (n_samples, 1), as hmmlearn expects.
    obs = np.array([vocab[word] for word in x]).reshape(-1, 1)
    pred_ids.extend(gen_model.predict(obs))
    y_true += y

# Decode every predicted id in a single vectorised call instead of invoking
# the label encoder once per token.
y_pred = le.inverse_transform(pred_ids).tolist()

print(classification_report(y_true, y_pred))

              precision    recall  f1-score   support

           A       0.78      0.77      0.77    490590
         Adv       0.95      0.81      0.87    288570
          Cj       1.00      0.73      0.85    229693
      Interj       0.99      0.76      0.86      1661
           N       0.99      0.75      0.85   1348773
         Num       0.99      0.82      0.90     70642
       Other       1.00      0.50      0.66      5269
          Pp       0.90      0.80      0.84     49314
        Pron       0.98      0.84      0.91    285198
       Punct       1.00      0.74      0.85    768481
           V       0.41      0.99      0.58    491621

    accuracy                           0.79   4029812
   macro avg       0.91      0.77      0.81   4029812
weighted avg       0.89      0.79      0.81   4029812



In [53]:
# Draw a 10-step sequence from the model: the emitted observations and the
# hidden state ids behind them (per hmmlearn's sample() return order).
words, pos_tags = gen_model.sample(10)

In [54]:
# Fetch the index -> token table once; asking the vocabulary for it again for
# every sampled word is needlessly expensive with a vocabulary this large.
itos = vocab.get_itos()
# Flatten the sampled observations so each element converts cleanly to int.
print([itos[int(word)] for word in words.ravel()])
# Decode all sampled state ids back to tag names in one call.
print(le.inverse_transform(pos_tags).tolist())

KeyboardInterrupt: 

# Conditional Random Fields (CRFs)

In [2]:
import os
# Directory of the tokenized dataset, relative to the working directory.
TOKENIZED_DATASET_PATH = os.path.join('data', 'dataset', 'tokenized')

In [3]:
from datasets import load_from_disk
# Load the dataset previously saved at TOKENIZED_DATASET_PATH.
tokenized_datasets = load_from_disk(TOKENIZED_DATASET_PATH)

In [5]:
# Inspect the training split (its features and row count are shown as output).
tokenized_datasets['train']

Dataset({
    features: ['input_ids', 'token_type_ids', 'attention_mask', 'labels'],
    num_rows: 5678203
})

In [9]:
# Length of the longest 'input_ids' sequence found in the test split.
max(map(len, (example['input_ids'] for example in tokenized_datasets['test'])))

170