# All imports necessary

In [1]:
import warnings
warnings.filterwarnings("ignore")

In [2]:
import sys

In [3]:
sys.path.append('..')

In [4]:
import os

In [5]:
import numpy as np

In [6]:
import pandas as pd

In [7]:
from source.code.utils.utils import filter_by_subcorpus
from source.code.utils.utils import get_tagged_texts_as_pd

from source.code.transformers.sentenceextractor import SentenceExtractor
from source.code.models.memorytagger import MemoryTagger
from source.code.models.bilstmtagger import BiLSTMTagger

from source.code.utils.preprocessing import filtrations
from source.code.utils.preprocessing import additional_features
from source.code.utils.preprocessing import crf_pre_processing

Using TensorFlow backend.


In [8]:
from sklearn_crfsuite.metrics import flat_classification_report as crfsuite_classification_report

from sklearn_crfsuite import metrics

from sklearn_crfsuite import CRF as sklearn_crf

In [9]:
from seqeval.metrics import classification_report as seqeval_classification_report

In [10]:
from sklearn.metrics import classification_report as sklearn_classification_report

from sklearn.ensemble import RandomForestClassifier

from sklearn.model_selection import train_test_split

from sklearn.model_selection import cross_val_score

In [11]:
from keras.preprocessing.sequence import pad_sequences

In [12]:
import seaborn as sns

In [13]:
import matplotlib.pyplot as plt

In [14]:
sns.set(color_codes=True)

In [15]:
sns.set(font_scale=2)

In [16]:
%matplotlib inline

In [17]:
features = ['semantic_relation_tagged', 'animacy_tagged', 'lambda_dsr_len', 'word_sense_exists', 'is_title', 'contains_digits', 'word_len']
target = 'ner_tag'

In [18]:
pd.set_option('display.max_rows', 30000)

# Naive tag frequency memorization

In [None]:
target_subcorpus_folders = filter_by_subcorpus('../data/datasets/gmb-2.2.0/', 'subcorpus: Voice of America')

In [None]:
X = get_tagged_texts_as_pd(target_subcorpus_folders, '../data/datasets/gmb-2.2.0/')

In [None]:
X.ner_tag.value_counts()

In [None]:
X = filtrations(X)

In [None]:
X.ner_tag.value_counts()

In [None]:
X = additional_features(X)

In [None]:
X.info()

In [None]:
X, y = X.token, X.ner_tag

In [None]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42, stratify=y)

In [None]:
estimator = MemoryTagger()

In [None]:
scores = cross_val_score(estimator, X_train, y_train, cv=5, verbose=True, n_jobs=-1)

In [None]:
scores

In [None]:
estimator.fit(X_train, y_train)

In [None]:
sklearn_report = sklearn_classification_report(y_pred=estimator.predict(X_test), y_true=y_test)
print(sklearn_report)

# Random Forest Classifier

In [None]:
target_subcorpus_folders = filter_by_subcorpus('../data/datasets/gmb-2.2.0/', 'subcorpus: Voice of America')

In [None]:
X = get_tagged_texts_as_pd(target_subcorpus_folders, '../data/datasets/gmb-2.2.0/')

In [None]:
X.ner_tag.value_counts()

In [None]:
X = filtrations(X)

In [None]:
X.ner_tag.value_counts()

In [None]:
X = additional_features(X)

In [None]:
X.info()

In [None]:
X, y = X[features], X[target]

In [None]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42, stratify=y)

In [None]:
estimator = RandomForestClassifier()

In [None]:
scores = cross_val_score(estimator, X_train, y_train, cv=5, verbose=True, n_jobs=-1)

In [None]:
scores

In [None]:
estimator.fit(X_train, y_train)

In [None]:
report = sklearn_classification_report(y_pred=estimator.predict(X_test), y_true=y_test)
print(report)

# HMM

[This article](https://pdfs.semanticscholar.org/9528/4b31f27b9b8901fdc18554603610ebbc2752.pdf)  gives a full description of how to calculate parameters for Hidden Markov Model.

From [this article](https://www.digitalvidya.com/blog/inroduction-to-hidden-markov-models-using-python/) we have taken Viterbi algorithm.

But first let's read the data again.

In [None]:
target_subcorpus_folders = filter_by_subcorpus('../data/datasets/gmb-2.2.0/', 'subcorpus: Voice of America')

In [None]:
X = get_tagged_texts_as_pd(target_subcorpus_folders, '../data/datasets/gmb-2.2.0/')

In [None]:
X.head(30)

In [None]:
X.ner_tag.value_counts()

Lots of redundant tag additions (like -nam, -dow).

We will work without them:

In [None]:
X = filtrations(X, with_dots=True)

In [None]:
X.head(30)

In [None]:
X.ner_tag.value_counts()

In [None]:
X.ner_tag.nunique()

The approach proposed in the article mentioned above is divided into three steps:
- Data preparation;
- Parameter estimation (or training);
- Usage of parameters estimated (or testing).

## Step 1: Data preparation

In [None]:
X, y = SentenceExtractor().fit_transform(X)

In [None]:
lemma_sentence_lenghts = list(map(len, X))

In [None]:
tag_sentence_lenghts = list(map(len, y))

In [None]:
max(lemma_sentence_lenghts)

In [None]:
max(tag_sentence_lenghts)

In [None]:
all(len_lemmas == len_tags for len_lemmas, len_tags in zip(lemma_sentence_lenghts, tag_sentence_lenghts))

In [None]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42)

## Step 2: HMM Parameter Estimation

### Step 2.1: Find states.

In [None]:
states = list(set([tag for sentence in y_train for tag in sentence]))
tag2idx = {t: i for i, t in enumerate(states)}
idx2tag = {i: w for w, i in tag2idx.items()}

In [None]:
len(states)

In [None]:
states

### Step 2.2: Calculate Start probability (π)

In [None]:
pi = np.zeros(len(states))

In [None]:
for sentence in y_train:
    if len(sentence) > 0:
        pi[tag2idx[sentence[0]]] += 1

In [None]:
pi = pi / len(y_train)

### Step 2.3: Calculate transition probability (A)

In [None]:
A = np.zeros((len(states), len(states)))
A_i = np.zeros(len(states))

In [None]:
for i in range(len(y_train)):
    if len(y_train[i]) > 1:
        for j in range(len(y_train[i]) - 1):
            A[tag2idx[y_train[i][j]]][tag2idx[y_train[i][j + 1]]] += 1
            A_i[tag2idx[y_train[i][j]]] += 1
        A_i[tag2idx[y_train[i][len(y_train[i]) - 1]]] += 1
for i in range(len(states)):
    A[i] = A[i] / A_i[i]

In [None]:
plt.figure(figsize=(20, 12))
sns.heatmap(A, annot=True, xticklabels=states, yticklabels=states, fmt='.2g')
plt.tight_layout()
plt.show()

### Step 2.4: Calculate emission probability (B)

In [None]:
words = list(set([word for sentence in X_train for word in sentence]))
words.append('unknown_word')
word2idx = {t: i for i, t in enumerate(words)}
idx2word = {i: w for w, i in word2idx.items()}

In [None]:
len(words)

In [None]:
B = np.zeros((len(words), len(states)))
B_i = np.zeros(len(states))

In [None]:
for i in range(len(X_train)):
    if len(X_train[i]) > 0:
        for j in range(len(X_train[i])):
            B[
                word2idx[
                    X_train[i][j]
                ]
            ][
                tag2idx[
                    y_train[i][j]
                ]
            ] += 1
            B_i[tag2idx[y_train[i][j]]] += 1
for i in range(len(states)):
    B[:, i] = B[:, i] / B_i[i]

In [None]:
np.shape(pi)

In [None]:
np.shape(A)

In [None]:
np.shape(B.T)

## Step 3: Testing

### Step 3.1: Viterby implementation

In [None]:
def viterbi(pi, A, B, obs):
    nStates = np.shape(B)[0]
    T = np.shape(obs)[0]
    path = np.zeros(T)
    delta = np.zeros((nStates, T))
    phi = np.zeros((nStates, T))

    try:
        delta[:, 0] = pi * B[:, obs[0]]
        phi[:, 0] = 0
    except:
        print('\t', len(obs))

    for t in range(1, T):
        for s in range(nStates):
            delta[s, t] = np.max(delta[:, t - 1] * A[:, s]) * B[s, obs[t]]
            phi[s, t] = np.argmax(delta[:, t - 1] * A[:, s])

    path[T - 1] = np.argmax(delta[:, T - 1])
    for t in range(T - 2, -1, -1):
        #path[t] = phi[int(path[t+1]): int(t+1) , int(t+1)]
        path[t] = phi[int(path[t + 1]) , int(t + 1)]

    return path, delta, phi

### Step 3.2: Implementation testing

In [None]:
X_test_num = [[word2idx[word] if word in word2idx else word2idx['unknown_word'] for word in sentence] for sentence in X_test]

In [None]:
X_test_num = [sentence for sentence in X_test_num if len(sentence) > 0]

In [None]:
y_test = [sentence.tolist() for sentence in y_test if len(sentence) > 0]

In [None]:
y_pred = []
for sentence in X_test_num:
    path, _, _ = viterbi(pi, A, B.T, sentence)
    y_pred.append([idx2tag[step] for step in path])

In [None]:
crfsuite_report = crfsuite_classification_report(y_pred=y_pred, y_true=y_test)
print(crfsuite_report)

In [None]:
seqeval_report = seqeval_classification_report(y_pred=y_pred, y_true=y_test)
print(seqeval_report)

# CRF

In [19]:
target_subcorpus_folders = filter_by_subcorpus('../data/datasets/gmb-2.2.0/', 'subcorpus: Voice of America')

HBox(children=(IntProgress(value=0, description='Read folders: '), HTML(value='')))




HBox(children=(IntProgress(value=0, description='Filter folders: ', max=10000), HTML(value='')))




In [20]:
X = get_tagged_texts_as_pd(target_subcorpus_folders, '../data/datasets/gmb-2.2.0/')

In [21]:
X.ner_tag.value_counts()

O          1032479
geo-nam      55480
org-nam      44646
per-nam      22931
gpe-nam      19685
tim-dow      11398
tim-dat      10929
per-tit       9672
per-fam       8098
[]            4064
tim-moy       3811
tim-yoc       3009
per-giv       2376
tim-clo        810
art-nam        789
eve-nam        514
nat-nam        280
tim-nam        132
eve-ord         63
per-ini         55
per-ord         35
org-leg         13
tim-dom          8
art-add          1
per-mid          1
Name: ner_tag, dtype: int64

In [22]:
X = filtrations(X, with_dots=True)

HBox(children=(IntProgress(value=0, description='Punctuation without dots: ', max=1231279), HTML(value='')))




HBox(children=(IntProgress(value=0, description='Target tags: ', max=780339), HTML(value='')))




HBox(children=(IntProgress(value=0, description='IOB to BIO: ', max=780339), HTML(value='')))




In [23]:
X.ner_tag.value_counts()

O        590495
B-geo     42533
B-org     23134
B-tim     22603
I-per     22211
B-per     20861
I-org     19331
B-gpe     19104
I-geo     12438
I-tim      5481
I-gpe       579
B-art       409
I-art       322
B-eve       310
I-eve       248
B-nat       208
I-nat        72
Name: ner_tag, dtype: int64

In [24]:
X = additional_features(X)

HBox(children=(IntProgress(value=0, description='Semantic relation: ', max=780339), HTML(value='')))




HBox(children=(IntProgress(value=0, description='Animacy tagged: ', max=780339), HTML(value='')))




HBox(children=(IntProgress(value=0, description='Lambda-DSR len: ', max=780339), HTML(value='')))




HBox(children=(IntProgress(value=0, description='Word sense: ', max=780339), HTML(value='')))




HBox(children=(IntProgress(value=0, description='Is title: ', max=780339), HTML(value='')))




HBox(children=(IntProgress(value=0, description='Contains digits: ', max=780339), HTML(value='')))




HBox(children=(IntProgress(value=0, description='Word len: ', max=780339), HTML(value='')))




In [25]:
X.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 780339 entries, 0 to 1231278
Data columns (total 17 columns):
token                       780339 non-null object
pos_tag                     780339 non-null object
lemma                       780339 non-null object
ner_tag                     780339 non-null object
word_net_sense_number       780339 non-null object
verb_net_roles              780339 non-null object
semantic_relation           780339 non-null object
animacy_tag                 780339 non-null object
super_tag                   780339 non-null object
lambda_dsr                  780339 non-null object
semantic_relation_tagged    780339 non-null int64
animacy_tagged              780339 non-null int64
lambda_dsr_len              780339 non-null int64
word_sense_exists           780339 non-null int64
is_title                    780339 non-null int64
contains_digits             780339 non-null int64
word_len                    780339 non-null int64
dtypes: int64(7), object(10)

In [26]:
X, y = crf_pre_processing(X)

In [27]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42)

In [28]:
estimator = sklearn_crf(algorithm='lbfgs', c1=0.1, c2=0.1, max_iterations=100, all_possible_transitions=False)

In [29]:
scores = cross_val_score(estimator, X_train, y_train, cv=5, verbose=True, n_jobs=-1)

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed:  6.5min finished


In [30]:
scores

array([0.97782354, 0.9767841 , 0.97776573, 0.97667006, 0.97832773])

In [31]:
estimator.fit(X_train, y_train)

CRF(algorithm='lbfgs', all_possible_states=None,
  all_possible_transitions=False, averaging=None, c=None, c1=0.1, c2=0.1,
  calibration_candidates=None, calibration_eta=None,
  calibration_max_trials=None, calibration_rate=None,
  calibration_samples=None, delta=None, epsilon=None, error_sensitive=None,
  gamma=None, keep_tempfiles=None, linesearch=None, max_iterations=100,
  max_linesearch=None, min_freq=None, model_filename=None,
  num_memories=None, pa_type=None, period=None, trainer_cls=None,
  variance=None, verbose=False)

In [32]:
report = crfsuite_classification_report(y_pred=estimator.predict(X_test), y_true=y_test)
print(report)

              precision    recall  f1-score   support

       B-art       0.77      0.33      0.46       146
       B-eve       0.65      0.42      0.51       116
       B-geo       0.95      0.96      0.96     13941
       B-gpe       0.98      0.97      0.98      6329
       B-nat       0.86      0.65      0.74        78
       B-org       0.90      0.88      0.89      7629
       B-per       0.90      0.89      0.90      6861
       B-tim       0.97      0.95      0.96      7456
       I-art       0.79      0.38      0.51       119
       I-eve       0.60      0.35      0.44        98
       I-geo       0.95      0.97      0.96      3999
       I-gpe       0.89      0.75      0.81       172
       I-nat       1.00      0.77      0.87        26
       I-org       0.92      0.95      0.94      6375
       I-per       0.91      0.95      0.93      7271
       I-tim       0.93      0.90      0.91      1846
           O       0.99      0.99      0.99    175612

   micro avg       0.98   

In [34]:
seqeval_report = seqeval_classification_report(y_pred=estimator.predict(X_test), y_true=y_test)
print(seqeval_report)

AttributeError: 'numpy.ndarray' object has no attribute 'split'

# Bi-LSTM

In [None]:
target_subcorpus_folders = filter_by_subcorpus('../data/datasets/gmb-2.2.0/', 'subcorpus: Voice of America')

In [None]:
X = get_tagged_texts_as_pd(target_subcorpus_folders, '../data/datasets/gmb-2.2.0/')

In [None]:
X.head(30)

In [None]:
X.ner_tag.value_counts()

In [None]:
X = filtrations(X, with_dots=True)

In [None]:
X.ner_tag.value_counts()

In [None]:
X.ner_tag.nunique()

In [None]:
X, y = SentenceExtractor().fit_transform(X)

In [None]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42)

In [None]:
estimator = BiLSTMTagger(checkpoint_dir='../data/datasets/keras_model/')

In [None]:
estimator.fit(X_train, y_train)

In [None]:
y_pred = estimator.predict(X_test)

In [None]:
y_test = [[estimator.tag2idx[w] for w in s] for s in y_test]
y_test = pad_sequences(maxlen=75, sequences=y_test, padding="post", value=estimator.tag2idx["O"])
y_test = [[estimator.idx2tag[w] for w in s] for s in y_test]

In [None]:
crfsuite_report = crfsuite_classification_report(y_pred=y_pred, y_true=y_test)
print(crfsuite_report)

In [None]:
seqeval_report = seqeval_classification_report(y_pred=y_pred, y_true=y_test)
print(seqeval_report)

# Conclusion