# All necessary imports

In [None]:
import warnings

warnings.filterwarnings("ignore")

In [None]:
import sys

sys.path.append('..')

In [None]:
import os

In [None]:
import numpy as np

In [None]:
import pandas as pd

In [None]:
from source.code.utils.utils import filter_by_subcorpus
from source.code.utils.utils import get_tagged_texts_as_pd

from source.code.transformers.sentenceextractor import SentenceExtractor
from source.code.models.memorytagger import MemoryTagger
from source.code.models.bilstmtagger import BiLSTMTagger

from source.code.utils.preprocessing import filtrations
from source.code.utils.preprocessing import additional_features
from source.code.utils.preprocessing import crf_pre_processing

In [None]:
from sklearn_crfsuite import CRF as sklearn_crf
from sklearn_crfsuite.metrics import flat_classification_report as crfsuite_classification_report

In [None]:
from seqeval.metrics import classification_report as seqeval_classification_report

In [None]:
from sklearn.metrics import classification_report as sklearn_classification_report

from sklearn.ensemble import RandomForestClassifier

from sklearn.model_selection import train_test_split

from sklearn.model_selection import cross_val_score

In [None]:
from keras.preprocessing.sequence import pad_sequences

In [None]:
import seaborn as sns

In [None]:
import matplotlib.pyplot as plt

In [None]:
%matplotlib inline

In [None]:
# Column names used as model inputs (selected later via X[features]).
# Presumably these columns are produced by additional_features() — TODO confirm.
features = ['is_title', 'contains_digits', 'word_len']
# Column name of the label to predict (selected later via X[target]).
target = 'ner_tag'

# Naive tag frequency memorization

## Data reading

In [None]:
target_subcorpus_folders = filter_by_subcorpus('../data/datasets/gmb-2.2.0/', 'subcorpus: Voice of America')

In [None]:
X = get_tagged_texts_as_pd(target_subcorpus_folders, '../data/datasets/gmb-2.2.0/')

In [None]:
X.ner_tag.value_counts()

In [None]:
X = filtrations(X)

In [None]:
X.ner_tag.value_counts()

In [None]:
X = additional_features(X)

In [None]:
X.info()

In [None]:
X, y = X.token, X.ner_tag

In [None]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42, stratify=y)

In [None]:
estimator = MemoryTagger()

In [None]:
scores = cross_val_score(estimator, X_train, y_train, cv=5, verbose=True, n_jobs=-1)

In [None]:
scores

In [None]:
estimator.fit(X_train, y_train)

## Testing

In [None]:
sklearn_report = sklearn_classification_report(y_pred=estimator.predict(X_test), y_true=y_test)
print(sklearn_report)

# Random Forest Classifier

## Data reading

In [None]:
target_subcorpus_folders = filter_by_subcorpus('../data/datasets/gmb-2.2.0/', 'subcorpus: Voice of America')

In [None]:
X = get_tagged_texts_as_pd(target_subcorpus_folders, '../data/datasets/gmb-2.2.0/')

In [None]:
X.ner_tag.value_counts()

In [None]:
X = filtrations(X)

In [None]:
X.ner_tag.value_counts()

In [None]:
X = additional_features(X)

In [None]:
X.info()

In [None]:
X, y = X[features], X[target]

In [None]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42, stratify=y)

In [None]:
estimator = RandomForestClassifier()

In [None]:
scores = cross_val_score(estimator, X_train, y_train, cv=5, verbose=True, n_jobs=-1)

In [None]:
scores

In [None]:
estimator.fit(X_train, y_train)

## Testing

In [None]:
report = sklearn_classification_report(y_pred=estimator.predict(X_test), y_true=y_test)
print(report)

# HMM

[This article](https://pdfs.semanticscholar.org/9528/4b31f27b9b8901fdc18554603610ebbc2752.pdf) gives a full description of which HMM parameters need to be estimated.

The Viterbi algorithm implementation was adapted from [this article](https://www.digitalvidya.com/blog/inroduction-to-hidden-markov-models-using-python/).

In [None]:
target_subcorpus_folders = filter_by_subcorpus('../data/datasets/gmb-2.2.0/', 'subcorpus: Voice of America')

In [None]:
X = get_tagged_texts_as_pd(target_subcorpus_folders, '../data/datasets/gmb-2.2.0/')

In [None]:
X = filtrations(X, with_dots=True)

## Step 1: Data preparation

In [None]:
X, y = SentenceExtractor().fit_transform(X)

In [None]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42)

## Step 2: HMM Parameter Estimation

### Step 2.1: Find states.

In [None]:
# Collect every tag that occurs in the training sentences. The tags are sorted
# because the iteration order of a set of strings changes between interpreter
# runs, which would otherwise make the state indices (and the order of the
# heatmap axes) differ from run to run.
states = sorted({tag for sentence in y_train for tag in sentence})
# Tag -> state index and the inverse mapping.
tag2idx = {tag: index for index, tag in enumerate(states)}
idx2tag = {index: tag for tag, index in tag2idx.items()}

In [None]:
len(states)

In [None]:
states

### Step 2.2: Calculate Start probability (π)

In [None]:
pi = np.zeros(len(states))

In [None]:
# Count how often each tag opens a training sentence; empty sentences are skipped.
for tags in y_train:
    if len(tags) == 0:
        continue
    first_state = tag2idx[tags[0]]
    pi[first_state] += 1

In [None]:
pi = pi / len(y_train)

### Step 2.3: Calculate transition probability (A)

In [None]:
A = np.zeros((len(states), len(states)))
A_i = np.zeros(len(states))

In [None]:
# Accumulate tag-to-tag transition counts over sentences with at least two tags.
for tags in y_train:
    if len(tags) <= 1:
        continue
    tag_ids = [tag2idx[tag] for tag in tags]
    for source, destination in zip(tag_ids, tag_ids[1:]):
        A[source][destination] += 1
        A_i[source] += 1
    # The last tag has no outgoing transition but still counts as an
    # occurrence of its state in the denominator.
    A_i[tag_ids[-1]] += 1
# Turn each row of counts into relative frequencies (row i divided by A_i[i]).
A /= A_i[:, np.newaxis]

In [None]:
plt.figure(figsize=(20, 12))
sns.heatmap(A, annot=True, xticklabels=states, yticklabels=states, fmt='.2g')
plt.tight_layout()
plt.show()

### Step 2.4: Calculate emission probability (B)

In [None]:
# Vocabulary of the training sentences plus a placeholder entry that the test
# cells use for tokens missing from this vocabulary.
vocabulary = set([token for sentence in X_train for token in sentence])
words = [*vocabulary, 'unknown_word']
# Word -> index and the inverse mapping.
word2idx = {word: position for position, word in enumerate(words)}
idx2word = {position: word for word, position in word2idx.items()}

In [None]:
len(words)

In [None]:
B = np.zeros((len(words), len(states)))
B_i = np.zeros(len(states))

In [None]:
# Count how often each word is emitted by each tag in the training data.
# B[w, s] holds the count for word w under state s; B_i[s] is the total
# number of tokens observed with state s.
for sentence_words, sentence_tags in zip(X_train, y_train):
    for word, tag in zip(sentence_words, sentence_tags):
        state = tag2idx[tag]
        B[word2idx[word]][state] += 1
        B_i[state] += 1

# 'unknown_word' is appended to the vocabulary but never counted above, so its
# emission probability would be exactly 0 for every state. A single
# out-of-vocabulary token in a test sentence would then zero out every Viterbi
# score from that position on and make the decoded path arbitrary.
# Reserve one pseudo-count per state for it so unseen words stay decodable.
unknown_idx = word2idx['unknown_word']
B[unknown_idx, :] += 1
B_i += 1

# Normalise every column so that the emissions of each state sum to 1.
B /= B_i

In [None]:
np.shape(pi)

In [None]:
np.shape(A)

In [None]:
np.shape(B.T)

## Step 3: Testing

### Step 3.1: Viterbi implementation

In [None]:
def viterbi(pi, A, B, obs):
    """Find the most likely hidden-state sequence for an observation sequence.

    Parameters
    ----------
    pi : array of shape (nStates,)
        Start probability of every state.
    A : array of shape (nStates, nStates)
        Transition matrix; ``A[i, j]`` is the weight of moving from state
        ``i`` to state ``j``.
    B : array of shape (nStates, nSymbols)
        Emission matrix; ``B[s, o]`` is the weight of state ``s`` emitting
        observation index ``o``.
    obs : sequence of int
        Observation indices (columns of ``B``).

    Returns
    -------
    path : integer array of shape (T,)
        Index of the most likely state at every time step.
    delta : float array of shape (nStates, T)
        Best path score ending in each state at each time step.
    phi : integer array of shape (nStates, T)
        Back-pointers: best predecessor of each state at each time step.

    For an empty ``obs`` three empty (zero-length along T) arrays are returned.

    NOTE: scores are plain products of probabilities, so they can underflow
    to 0 on very long sequences.
    """
    nStates = np.shape(B)[0]
    T = np.shape(obs)[0]
    # path and phi hold state indices, so keep them integer typed.
    path = np.zeros(T, dtype=int)
    delta = np.zeros((nStates, T))
    phi = np.zeros((nStates, T), dtype=int)

    # Nothing to decode; avoids indexing into zero-length arrays below.
    if T == 0:
        return path, delta, phi

    delta[:, 0] = pi * B[:, obs[0]]

    for t in range(1, T):
        # scores[i, s] = score of being in state i at t - 1 and moving to s.
        scores = delta[:, t - 1, np.newaxis] * A
        phi[:, t] = np.argmax(scores, axis=0)
        delta[:, t] = np.max(scores, axis=0) * B[:, obs[t]]

    # Backtrack from the best final state along the stored back-pointers.
    path[T - 1] = np.argmax(delta[:, T - 1])
    for t in range(T - 2, -1, -1):
        path[t] = phi[path[t + 1], t + 1]

    return path, delta, phi

### Step 3.2: Implementation testing

In [None]:
# Encode every test token as its vocabulary index; tokens that are not in the
# training vocabulary are mapped to the 'unknown_word' index.
unknown_word_idx = word2idx['unknown_word']
X_test_num = [
    [word2idx.get(word, unknown_word_idx) for word in sentence]
    for sentence in X_test
]

In [None]:
X_test_num = [sentence for sentence in X_test_num if len(sentence) > 0]

In [None]:
y_test = [sentence.tolist() for sentence in y_test if len(sentence) > 0]

In [None]:
# Decode every encoded test sentence with Viterbi and translate the resulting
# state indices back into tag names.
y_pred = [
    [idx2tag[step] for step in viterbi(pi, A, B.T, sentence)[0]]
    for sentence in X_test_num
]

In [None]:
crfsuite_report = crfsuite_classification_report(y_pred=y_pred, y_true=y_test)
print(crfsuite_report)

In [None]:
seqeval_report = seqeval_classification_report(y_pred=y_pred, y_true=y_test)
print(seqeval_report)

# CRF

## Data reading

In [None]:
target_subcorpus_folders = filter_by_subcorpus('../data/datasets/gmb-2.2.0/', 'subcorpus: Voice of America')

In [None]:
X = get_tagged_texts_as_pd(target_subcorpus_folders, '../data/datasets/gmb-2.2.0/')

In [None]:
X.ner_tag.value_counts()

In [None]:
X = filtrations(X, with_dots=True)

In [None]:
X.ner_tag.value_counts()

In [None]:
X = additional_features(X)

In [None]:
X.info()

In [None]:
X, y = crf_pre_processing(X)

In [None]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42)

In [None]:
estimator = sklearn_crf(algorithm='lbfgs', c1=0.1, c2=0.1, max_iterations=100, all_possible_transitions=False)

In [None]:
scores = cross_val_score(estimator, X_train, y_train, cv=5, verbose=True, n_jobs=-1)

In [None]:
scores

In [None]:
estimator.fit(X_train, y_train)

In [None]:
y_pred = estimator.predict(X_test)

In [None]:
y_test = [sentence.tolist() for sentence in y_test]

## Testing

In [None]:
report = crfsuite_classification_report(y_pred=y_pred, y_true=y_test)
print(report)

In [None]:
seqeval_report = seqeval_classification_report(y_pred=y_pred, y_true=y_test)
print(seqeval_report)

# Bi-LSTM

## Data reading

In [None]:
target_subcorpus_folders = filter_by_subcorpus('../data/datasets/gmb-2.2.0/', 'subcorpus: Voice of America')

In [None]:
X = get_tagged_texts_as_pd(target_subcorpus_folders, '../data/datasets/gmb-2.2.0/')

In [None]:
X.head(30)

In [None]:
X.ner_tag.value_counts()

In [None]:
X = filtrations(X, with_dots=True)

In [None]:
X.ner_tag.value_counts()

In [None]:
X.ner_tag.nunique()

In [None]:
X, y = SentenceExtractor().fit_transform(X)

In [None]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42)

In [None]:
estimator = BiLSTMTagger(checkpoint_dir='../data/datasets/keras_model/')

In [None]:
estimator.fit(X_train, y_train)

In [None]:
y_pred = estimator.predict(X_test)

In [None]:
# Bring the gold tags into a fixed-length form: tags -> indices, pad every
# sentence to 75 positions at the end with the index of the "O" tag, then map
# the indices back to tag names.
# NOTE(review): maxlen=75 is hard-coded; presumably it has to match the
# sequence length used inside BiLSTMTagger so y_test lines up with y_pred — confirm.
# NOTE(review): the padded positions are labelled "O" and are therefore part of
# what the reports below are computed on.
y_test = [[estimator.tag2idx[w] for w in s] for s in y_test]
y_test = pad_sequences(maxlen=75, sequences=y_test, padding="post", value=estimator.tag2idx["O"])
y_test = [[estimator.idx2tag[w] for w in s] for s in y_test]

## Testing

In [None]:
crfsuite_report = crfsuite_classification_report(y_pred=y_pred, y_true=y_test)
print(crfsuite_report)

In [None]:
seqeval_report = seqeval_classification_report(y_pred=y_pred, y_true=y_test)
print(seqeval_report)

# Conclusion

Итак, в данной работе были предприняты:
- простая попытка запомнить частоту тэгов для конкретных слов и выдача тэга с максимальной частотой для конкретного слова;
- попытка использовать какой-нибудь мульти-классификатор;
- попытка реализовать вариант скрытой марковской модели с использованием алгоритма Витерби для нахождения максимально вероятной цепочки скрытых состояний;
- попытка использовать CRF;
- попытка использовать двунаправленную LSTM.

Для корректного расчета метрик использовалась библиотека [seqeval](https://github.com/chakki-works/seqeval), она позволяет считать метрики с учетом особенностей многословных именованных сущностей.

Лучше всего на тесте себя показал CRF.