# All necessary imports

In [1]:
import warnings
warnings.filterwarnings("ignore")

In [2]:
import sys

In [3]:
sys.path.append('..')

In [4]:
import os

In [5]:
import numpy as np

In [6]:
import pandas as pd

In [7]:
from source.code.utils.utils import filter_by_subcorpus
from source.code.utils.utils import get_tagged_texts_as_pd

from source.code.transformers.sentenceextractor import SentenceExtractor
from source.code.models.memorytagger import MemoryTagger
from source.code.models.bilstmtagger import BiLSTMTagger

from source.code.utils.preprocessing import filtrations
from source.code.utils.preprocessing import additional_features
from source.code.utils.preprocessing import crf_pre_processing

Using TensorFlow backend.


In [8]:
from sklearn_crfsuite.metrics import flat_classification_report as crfsuite_classification_report

from sklearn_crfsuite import metrics

from sklearn_crfsuite import CRF as sklearn_crf

In [9]:
from seqeval.metrics import classification_report as seqeval_classification_report

In [10]:
# Import the seqeval helper module under its own alias. Binding it to
# `seqeval_classification_report` would overwrite the report function imported
# under that name earlier in the notebook with a module object, which cannot be
# called when the Bi-LSTM report is produced.
from seqeval.metrics import sequence_labeling as seqeval_sequence_labeling

In [11]:
from sklearn.metrics import classification_report as sklearn_classification_report

from sklearn.ensemble import RandomForestClassifier

from sklearn.model_selection import train_test_split

from sklearn.model_selection import cross_val_score

In [12]:
from keras.preprocessing.sequence import pad_sequences

In [13]:
import seaborn as sns

In [14]:
import matplotlib.pyplot as plt

In [15]:
sns.set(color_codes=True)

In [16]:
sns.set(font_scale=2)

In [17]:
%matplotlib inline

In [18]:
features = ['semantic_relation_tagged', 'animacy_tagged', 'lambda_dsr_len', 'word_sense_exists', 'is_title', 'contains_digits', 'word_len']
target = 'ner_tag'

In [19]:
pd.set_option('display.max_rows', 30000)

# Naive tag frequency memorization

In [None]:
target_subcorpus_folders = filter_by_subcorpus('../data/datasets/gmb-2.2.0/', 'subcorpus: Voice of America')

In [None]:
X = get_tagged_texts_as_pd(target_subcorpus_folders, '../data/datasets/gmb-2.2.0/')

In [None]:
X.ner_tag.value_counts()

In [None]:
X = filtrations(X)

In [None]:
X.ner_tag.value_counts()

In [None]:
X = additional_features(X)

In [None]:
X.info()

In [None]:
X, y = X.token, X.ner_tag

In [None]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42, stratify=y)

In [None]:
estimator = MemoryTagger()

In [None]:
scores = cross_val_score(estimator, X_train, y_train, cv=5, verbose=True, n_jobs=-1)

In [None]:
scores

In [None]:
estimator.fit(X_train, y_train)

In [None]:
sklearn_report = sklearn_classification_report(y_pred=estimator.predict(X_test), y_true=y_test)
print(sklearn_report)

# Random Forest Classifier

In [None]:
target_subcorpus_folders = filter_by_subcorpus('../data/datasets/gmb-2.2.0/', 'subcorpus: Voice of America')

In [None]:
X = get_tagged_texts_as_pd(target_subcorpus_folders, '../data/datasets/gmb-2.2.0/')

In [None]:
X.ner_tag.value_counts()

In [None]:
X = filtrations(X)

In [None]:
X.ner_tag.value_counts()

In [None]:
X = additional_features(X)

In [None]:
X.info()

In [None]:
X, y = X[features], X[target]

In [None]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42, stratify=y)

In [None]:
estimator = RandomForestClassifier()

In [None]:
scores = cross_val_score(estimator, X_train, y_train, cv=5, verbose=True, n_jobs=-1)

In [None]:
scores

In [None]:
estimator.fit(X_train, y_train)

In [None]:
report = sklearn_classification_report(y_pred=estimator.predict(X_test), y_true=y_test)
print(report)

# HMM

[This article](https://pdfs.semanticscholar.org/9528/4b31f27b9b8901fdc18554603610ebbc2752.pdf) gives a full description of how to calculate the parameters of a Hidden Markov Model.

From [this article](https://www.digitalvidya.com/blog/inroduction-to-hidden-markov-models-using-python/) we have taken the Viterbi algorithm.

But first let's read the data again.

In [20]:
target_subcorpus_folders = filter_by_subcorpus('../data/datasets/gmb-2.2.0/', 'subcorpus: Voice of America')

HBox(children=(IntProgress(value=0, description='Read folders: '), HTML(value='')))




HBox(children=(IntProgress(value=0, description='Filter folders: ', max=10000), HTML(value='')))




In [21]:
X = get_tagged_texts_as_pd(target_subcorpus_folders, '../data/datasets/gmb-2.2.0/')

In [22]:
X.head(30)

Unnamed: 0,token,pos_tag,lemma,ner_tag,word_net_sense_number,verb_net_roles,semantic_relation,animacy_tag,super_tag,lambda_dsr
0,Former,JJ,former,O,1,[Topic],O,O,N/N,"lam(v1,lam(v2,merge(b1:drs([b1:[]:s1],[b1:[]:r..."
1,Ecuadorean,NNP,ecuadorean,O,0,[],=,Human,N/N,"lam(v1,lam(v2,merge(b1:drs([b1:[]:x1],[b1:[100..."
2,President,NNP,president,per-tit,1,[],=,Human,N/N,"lam(v1,lam(v2,merge(b1:drs([b1:[]:x1],[b1:[100..."
3,Lucio,NNP,lucio,per-nam,0,[],=,Human,N/N,"lam(v1,lam(v2,merge(b1:drs([b1:[]:x1],[b1:[100..."
4,Gutierrez,NNP,gutierrez,per-nam,0,[],O,Human,N,"lam(v1,b1:drs([],[b1:[1005]:named(v1,gutierrez..."
5,has,VBZ,have,O,0,[],O,O,(S[dcl]\NP)/(S[pt]\NP),"lam(v1,lam(v2,lam(v3,app(app(v1,v2),lam(v4,mer..."
6,formally,RB,formally,O,1,[],O,O,(S[dcl]\NP)\(S[dcl]\NP),"lam(v1,lam(v2,lam(v3,app(app(v1,v2),lam(v4,mer..."
7,requested,VBN,request,O,1,"[Topic,Agent]",O,O,(S[pt]\NP)/NP,"lam(v1,lam(v2,lam(v3,app(v2,lam(v4,app(v1,lam(..."
8,political,JJ,political,O,1,[Topic],O,O,N/N,"lam(v1,lam(v2,merge(b1:drs([b1:[]:s1],[b1:[]:r..."
9,asylum,NN,asylum,O,1,[],O,Non-concrete,N,"lam(v1,b1:drs([],[b1:[1010]:pred(v1,asylum,n,'..."


In [23]:
X.ner_tag.value_counts()

O          1032479
geo-nam      55480
org-nam      44646
per-nam      22931
gpe-nam      19685
tim-dow      11398
tim-dat      10929
per-tit       9672
per-fam       8098
[]            4064
tim-moy       3811
tim-yoc       3009
per-giv       2376
tim-clo        810
art-nam        789
eve-nam        514
nat-nam        280
tim-nam        132
eve-ord         63
per-ini         55
per-ord         35
org-leg         13
tim-dom          8
per-mid          1
art-add          1
Name: ner_tag, dtype: int64

The tags carry many redundant suffixes (such as -nam, -dow).

We will work without them:

In [24]:
X = filtrations(X, with_dots=True)

HBox(children=(IntProgress(value=0, description='Punctuation without dots: ', max=1231279), HTML(value='')))




HBox(children=(IntProgress(value=0, description='Target tags: ', max=780339), HTML(value='')))




HBox(children=(IntProgress(value=0, description='IOB to BIO: ', max=780339), HTML(value='')))




In [25]:
X.head(30)

Unnamed: 0,token,pos_tag,lemma,ner_tag,word_net_sense_number,verb_net_roles,semantic_relation,animacy_tag,super_tag,lambda_dsr
0,Former,JJ,former,O,1,[Topic],O,O,N/N,"lam(v1,lam(v2,merge(b1:drs([b1:[]:s1],[b1:[]:r..."
1,Ecuadorean,NNP,ecuadorean,O,0,[],=,Human,N/N,"lam(v1,lam(v2,merge(b1:drs([b1:[]:x1],[b1:[100..."
2,President,NNP,president,B-per,1,[],=,Human,N/N,"lam(v1,lam(v2,merge(b1:drs([b1:[]:x1],[b1:[100..."
3,Lucio,NNP,lucio,I-per,0,[],=,Human,N/N,"lam(v1,lam(v2,merge(b1:drs([b1:[]:x1],[b1:[100..."
4,Gutierrez,NNP,gutierrez,I-per,0,[],O,Human,N,"lam(v1,b1:drs([],[b1:[1005]:named(v1,gutierrez..."
6,formally,RB,formally,O,1,[],O,O,(S[dcl]\NP)\(S[dcl]\NP),"lam(v1,lam(v2,lam(v3,app(app(v1,v2),lam(v4,mer..."
7,requested,VBN,request,O,1,"[Topic,Agent]",O,O,(S[pt]\NP)/NP,"lam(v1,lam(v2,lam(v3,app(v2,lam(v4,app(v1,lam(..."
8,political,JJ,political,O,1,[Topic],O,O,N/N,"lam(v1,lam(v2,merge(b1:drs([b1:[]:s1],[b1:[]:r..."
9,asylum,NN,asylum,O,1,[],O,Non-concrete,N,"lam(v1,b1:drs([],[b1:[1010]:pred(v1,asylum,n,'..."
11,Colombia,NNP,colombia,B-geo,1,[],O,Place,N,"lam(v1,b1:drs([],[b1:[1012]:named(v1,colombia,..."


In [26]:
X.ner_tag.value_counts()

O        590495
B-geo     42533
B-org     23134
B-tim     22603
I-per     22211
B-per     20861
I-org     19331
B-gpe     19104
I-geo     12438
I-tim      5481
I-gpe       579
B-art       409
I-art       322
B-eve       310
I-eve       248
B-nat       208
I-nat        72
Name: ner_tag, dtype: int64

In [27]:
X.ner_tag.nunique()

17

In [28]:
_X = X.copy()

In [29]:
# Boolean-mask assignment with a scalar: every column (ner_tag included) of the
# rows whose lemma is '.' is overwritten with '%'. The following cell joins
# ner_tag with '|' and splits the result on '%'.
_X[_X.lemma == '.'] = '%'

In [46]:
'|'.join(_X.ner_tag.values).split('%')[0]

'O|O|B-per|I-per|I-per|O|O|O|O|B-geo|O|O|O|O|O|O|'

The approach proposed in the article mentioned above is divided into three steps:
- Data preparation;
- Parameter estimation (or training);
- Use of the estimated parameters (testing).

## Step 1: Data preparation

In [None]:
X, y = SentenceExtractor().fit_transform(X)

In [None]:
lemma_sentence_lenghts = list(map(len, X))

In [None]:
tag_sentence_lenghts = list(map(len, y))

In [None]:
max(lemma_sentence_lenghts)

In [None]:
max(tag_sentence_lenghts)

In [None]:
all(len_lemmas == len_tags for len_lemmas, len_tags in zip(lemma_sentence_lenghts, tag_sentence_lenghts))

In [None]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42)

## Step 2: HMM Parameter Estimation

### Step 2.1: Find states.

In [None]:
# Distinct tags observed in the training sentences are the HMM states.
# Sorting them (instead of relying on set iteration order, which changes between
# interpreter runs for strings) gives every tag the same index on every run, so
# pi, A, B and the heatmap axes are reproducible.
states = sorted({tag for sentence in y_train for tag in sentence})
# Forward and reverse lookups between a tag and its row/column index.
tag2idx = {tag: idx for idx, tag in enumerate(states)}
idx2tag = {idx: tag for tag, idx in tag2idx.items()}

In [None]:
len(states)

In [None]:
states

In [None]:
# Show every sentence that has exactly three tags: its tokens first, then its tags.
for position, tags in enumerate(y):
    if len(tags) != 3:
        continue
    print('\t', X[position])
    print('\t', tags)

### Step 2.2: Calculate start probability (π)

In [None]:
pi = np.zeros(len(states))

In [None]:
for sentence in y_train:
    if len(sentence) > 0:
        pi[tag2idx[sentence[0]]] += 1

In [None]:
# Turn the start-tag counts into a probability distribution.
# The counting loop skips empty sentences, so dividing by len(y_train) would leave
# pi summing to less than 1 whenever such sentences exist; dividing by the total
# of the counts always yields a proper distribution and makes re-running this
# cell a no-op instead of shrinking pi again.
start_total = pi.sum()
if start_total > 0:
    pi = pi / start_total

### Step 2.3: Calculate transition probability (A)

In [None]:
A = np.zeros((len(states), len(states)))
A_i = np.zeros(len(states))

In [None]:
# Accumulate tag-bigram counts: A[prev][next] counts how often `next` directly
# follows `prev`; A_i[tag] counts the occurrences of `tag` in sentences that have
# at least two tags (the final tag of such a sentence is counted as well).
for tags in y_train:
    if len(tags) > 1:
        for prev_tag, next_tag in zip(tags[:-1], tags[1:]):
            A[tag2idx[prev_tag]][tag2idx[next_tag]] += 1
            A_i[tag2idx[prev_tag]] += 1
        A_i[tag2idx[tags[-1]]] += 1

# Divide each row by its tag count. Rows whose count is zero (a tag that never
# occurs in a sentence of two or more tags) are left at zero rather than being
# turned into NaN by a 0/0 division.
seen = A_i > 0
A[seen] = A[seen] / A_i[seen][:, None]

In [None]:
# Annotated heatmap of the transition matrix: rows are the preceding tag,
# columns the following tag.
fig, ax = plt.subplots(figsize=(20, 12))
sns.heatmap(A, annot=True, fmt='.2g', xticklabels=states, yticklabels=states, ax=ax)
fig.tight_layout()
plt.show()

### Step 2.4: Calculate emission probability (B)

In [None]:
# Vocabulary of the training sentences. Sorting replaces the run-dependent set
# iteration order, so each word keeps the same row index in B on every run.
# key=str keeps the sort well-defined should a non-string token be present
# (TODO confirm tokens are always str).
words = sorted({word for sentence in X_train for word in sentence}, key=str)
# Forward and reverse lookups between a word and its row index.
word2idx = {word: idx for idx, word in enumerate(words)}
idx2word = {idx: word for word, idx in word2idx.items()}

In [None]:
len(words)

In [None]:
B = np.zeros((len(words), len(states)))
B_i = np.zeros(len(states))

In [None]:
# Accumulate emission counts: B[word][tag] counts how often `word` is labelled
# with `tag`; B_i[tag] counts all tokens labelled with `tag`.
for sent_idx, (tokens, tags) in enumerate(zip(X_train, y_train)):
    if len(tokens) != len(tags):
        # Report misaligned sentences explicitly (index, token count, tag count)
        # instead of hiding every possible error behind a bare `except`.
        print('\t', sent_idx, len(tokens), len(tags))
    # zip stops at the shorter sequence, so only aligned token/tag pairs count.
    for token, tag in zip(tokens, tags):
        B[word2idx[token]][tag2idx[tag]] += 1
        B_i[tag2idx[tag]] += 1

# Divide each tag column by that tag's total; columns with a zero total are left
# at zero rather than becoming NaN.
observed = B_i > 0
B[:, observed] = B[:, observed] / B_i[observed]

In [None]:
X_train[23989]

In [None]:
y_train[23989]

## Step 3: Testing

# CRF

In [None]:
target_subcorpus_folders = filter_by_subcorpus('../data/datasets/gmb-2.2.0/', 'subcorpus: Voice of America')

In [None]:
X = get_tagged_texts_as_pd(target_subcorpus_folders, '../data/datasets/gmb-2.2.0/')

In [None]:
X.ner_tag.value_counts()

In [None]:
X = filtrations(X, with_dots=True)

In [None]:
X.ner_tag.value_counts()

In [None]:
X = additional_features(X)

In [None]:
X.info()

In [None]:
# Build the CRF inputs and targets with the helper imported at the top of the
# notebook. The previously used name `crf_filtration_and_pre_processing` is never
# imported, so the cell raised NameError on a fresh kernel.
# NOTE(review): confirm crf_pre_processing returns the (X, y) pair expected here.
X, y = crf_pre_processing(X)

In [None]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42)

In [None]:
estimator = sklearn_crf(algorithm='lbfgs', c1=0.1, c2=0.1, max_iterations=100, all_possible_transitions=False)

In [None]:
scores = cross_val_score(estimator, X_train, y_train, cv=5, verbose=True, n_jobs=-1)

In [None]:
scores

In [None]:
estimator.fit(X_train, y_train)

In [None]:
report = crfsuite_classification_report(y_pred=estimator.predict(X_test), y_true=y_test)
print(report)

# Bi-LSTM

In [None]:
target_subcorpus_folders = filter_by_subcorpus('../data/datasets/gmb-2.2.0/', 'subcorpus: Voice of America')

In [None]:
X = get_tagged_texts_as_pd(target_subcorpus_folders, '../data/datasets/gmb-2.2.0/')

In [None]:
X.head(30)

In [None]:
X.ner_tag.value_counts()

In [None]:
X = filtrations(X, with_dots=True)

In [None]:
X.ner_tag.value_counts()

In [None]:
X.ner_tag.nunique()

In [None]:
X, y = SentenceExtractor().fit_transform(X)

In [None]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42)

In [None]:
estimator = BiLSTMTagger(checkpoint_dir='../data/datasets/keras_model/')

In [None]:
estimator.fit(X_train, y_train)

In [None]:
y_pred = estimator.predict(X_test)

In [None]:
# Bring the gold tags to fixed-length sequences before scoring against y_pred.
# 1) Map every tag to its integer index using the fitted tagger's vocabulary.
y_test = [[estimator.tag2idx[w] for w in s] for s in y_test]
# 2) Pad at the end of each sentence up to 75 positions, filling with the index of "O".
#    NOTE(review): 75 presumably matches the tagger's own sequence length, and the
#    truncation side is left at the Keras default — confirm both against BiLSTMTagger.
y_test = pad_sequences(maxlen=75, sequences=y_test, padding="post", value=estimator.tag2idx["O"])
# 3) Map the padded indices back to tag strings.
y_test = [[estimator.idx2tag[w] for w in s] for s in y_test]

In [None]:
crfsuite_report = crfsuite_classification_report(y_pred=y_pred, y_true=y_test)
print(crfsuite_report)

In [None]:
seqeval_report = seqeval_classification_report(y_pred=y_pred, y_true=y_test)
print(seqeval_report)

# Conclusion