# All necessary imports

In [1]:
import warnings
# Install a global "ignore" filter so warnings do not clutter the cell outputs.
# NOTE(review): this also hides deprecation and convergence warnings from the libraries used below.
warnings.filterwarnings("ignore")

In [2]:
import sys

In [3]:
# Make the parent directory importable so the `source.code...` imports resolve
# when the notebook is launched from its own folder.
# The membership check keeps the cell idempotent: re-running it no longer
# appends a duplicate '..' entry to sys.path.
if '..' not in sys.path:
    sys.path.append('..')

In [4]:
import os
import numpy as np
import pandas as pd

In [6]:
from source.code.utils.utils import filter_by_subcorpus
from source.code.utils.utils import get_tagged_texts_as_pd

from source.code.models.memorytagger import MemoryTagger
from source.code.models.bilstmtagger import BiLSTMTagger

from source.code.utils.preprocessing import filtrations
from source.code.utils.preprocessing import additional_features
from source.code.utils.preprocessing import crf_pre_processing

In [7]:
from sklearn_crfsuite.metrics import flat_classification_report

from sklearn_crfsuite import metrics

from sklearn_crfsuite import CRF as sklearn_crf

In [8]:
from sklearn.metrics import classification_report

from sklearn.ensemble import RandomForestClassifier

from sklearn.model_selection import train_test_split

from sklearn.model_selection import cross_val_score

In [9]:
from keras.preprocessing.sequence import pad_sequences

In [10]:
# Columns fed to the feature-based classifier (used as data[features]).
features = [
    'semantic_relation_tagged',
    'animacy_tagged',
    'lambda_dsr_len',
    'word_sense_exists',
    'is_title',
    'contains_digits',
    'word_len',
]

# Column holding the label to predict (used as data[target]).
target = 'ner_tag'

In [17]:
# Let pandas render up to 30000 rows before truncating a displayed frame/series.
pd.options.display.max_rows = 30000

# Naive tag frequency memorization

In [None]:
# Pick the GMB 2.2.0 folders tagged with the Voice of America subcorpus line.
# NOTE(review): exact matching rules live in source.code.utils.utils — confirm there.
target_subcorpus_folders = filter_by_subcorpus(
    '../data/datasets/gmb-2.2.0/',
    'subcorpus: Voice of America',
)

In [None]:
# Read the selected folders into a single DataFrame bound to `data`.
data = get_tagged_texts_as_pd(
    target_subcorpus_folders,
    '../data/datasets/gmb-2.2.0/',
)

In [None]:
# Label distribution of the freshly loaded data (before any filtering).
data['ner_tag'].value_counts()

In [None]:
# Apply the project's filtering step and rebind `data` to the result.
# NOTE(review): which rows/tags are dropped is defined in source.code.utils.preprocessing — confirm there.
# Re-running this cell filters the already-filtered frame; reload `data` first if in doubt.
data = filtrations(data)

In [None]:
# Label distribution again, now after the filtering step, for comparison.
data['ner_tag'].value_counts()

In [None]:
# Run the project's feature-engineering step and rebind `data` to the result.
# NOTE(review): presumably adds the derived columns listed in `features` — confirm in source.code.utils.preprocessing.
data = additional_features(data)

In [None]:
# Sanity check: column names, dtypes and non-null counts after preprocessing.
data.info()

In [None]:
# The memorization baseline only sees the raw token; the label is its NER tag.
X = data['token']
y = data['ner_tag']

In [None]:
# Hold out 33% of the rows for testing; stratify on the tag so both parts keep
# the same label proportions, and fix the seed so the split is repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.33,
    random_state=42,
    stratify=y,
)

In [None]:
# Project-local baseline tagger with default settings.
# NOTE(review): per the section title it presumably memorizes tag frequencies per token — confirm in source.code.models.memorytagger.
estimator = MemoryTagger()

In [None]:
# 5-fold cross-validation on the training part only, using all CPU cores;
# the metric is whatever the estimator's own `score` method reports.
scores = cross_val_score(
    estimator,
    X_train,
    y_train,
    cv=5,
    verbose=True,
    n_jobs=-1,
)

In [None]:
# Show the per-fold cross-validation scores.
scores

In [None]:
# Fit on the full training split; cross_val_score above does not fit `estimator` itself.
estimator.fit(X_train, y_train)

In [None]:
# Per-tag precision / recall / F1 on the held-out test split.
report = classification_report(
    y_true=y_test,
    y_pred=estimator.predict(X_test),
)
print(report)

# Random Forest Classifier

In [None]:
# Pick the GMB 2.2.0 folders tagged with the Voice of America subcorpus line.
# NOTE(review): exact matching rules live in source.code.utils.utils — confirm there.
target_subcorpus_folders = filter_by_subcorpus(
    '../data/datasets/gmb-2.2.0/',
    'subcorpus: Voice of America',
)

In [None]:
# Reload the selected folders into a fresh DataFrame for this experiment.
data = get_tagged_texts_as_pd(
    target_subcorpus_folders,
    '../data/datasets/gmb-2.2.0/',
)

In [None]:
# Label distribution of the freshly loaded data (before any filtering).
data['ner_tag'].value_counts()

In [None]:
# Apply the project's filtering step and rebind `data` to the result.
# NOTE(review): which rows/tags are dropped is defined in source.code.utils.preprocessing — confirm there.
data = filtrations(data)

In [None]:
# Label distribution again, now after the filtering step, for comparison.
data['ner_tag'].value_counts()

In [None]:
# Run the project's feature-engineering step and rebind `data` to the result.
# NOTE(review): presumably produces the columns listed in `features`, which the next cells select — confirm.
data = additional_features(data)

In [None]:
# Sanity check: column names, dtypes and non-null counts after preprocessing.
data.info()

In [None]:
# Feature matrix from the configured feature columns; labels from the target column.
X = data[features]
y = data[target]

In [None]:
# Hold out 33% of the rows for testing; stratify on the tag so both parts keep
# the same label proportions, and fix the seed so the split is repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.33,
    random_state=42,
    stratify=y,
)

In [None]:
# Random forest with library-default hyper-parameters.
# The seed is fixed (same value as the train/test split) so that bootstrap
# sampling and feature sub-sampling are repeatable; without it the
# cross-validation scores and the final report change on every run.
estimator = RandomForestClassifier(random_state=42)

In [None]:
# 5-fold cross-validation on the training part only, using all CPU cores;
# the metric is whatever the estimator's own `score` method reports.
scores = cross_val_score(
    estimator,
    X_train,
    y_train,
    cv=5,
    verbose=True,
    n_jobs=-1,
)

In [None]:
# Show the per-fold cross-validation scores.
scores

In [None]:
# Fit on the full training split; cross_val_score above does not fit `estimator` itself.
estimator.fit(X_train, y_train)

In [None]:
# Per-tag precision / recall / F1 on the held-out test split.
report = classification_report(
    y_true=y_test,
    y_pred=estimator.predict(X_test),
)
print(report)

# HMM

TBD...

# CRF

In [None]:
# Pick the GMB 2.2.0 folders tagged with the Voice of America subcorpus line.
# NOTE(review): exact matching rules live in source.code.utils.utils — confirm there.
target_subcorpus_folders = filter_by_subcorpus(
    '../data/datasets/gmb-2.2.0/',
    'subcorpus: Voice of America',
)

In [None]:
# Reload the selected folders into a fresh DataFrame for this experiment.
data = get_tagged_texts_as_pd(
    target_subcorpus_folders,
    '../data/datasets/gmb-2.2.0/',
)

In [None]:
# Label distribution of the freshly loaded data (before any filtering).
data['ner_tag'].value_counts()

In [None]:
# Same filtering step as the token-level experiments, but with `with_dots=True`.
# NOTE(review): presumably keeps sentence-final dots so sentence boundaries survive for the sequence model — confirm in source.code.utils.preprocessing.
data = filtrations(data, with_dots=True)

In [None]:
# Label distribution again, now after the filtering step, for comparison.
data['ner_tag'].value_counts()

In [None]:
# Run the project's feature-engineering step and rebind `data` to the result.
# NOTE(review): the exact columns added are defined in source.code.utils.preprocessing — confirm there.
data = additional_features(data)

In [None]:
# Sanity check: column names, dtypes and non-null counts after preprocessing.
data.info()

In [None]:
# Turn the token table into CRF inputs and labels.
# The helper imported at the top of the notebook is `crf_pre_processing`;
# the previously used name `crf_filtration_and_pre_processing` is not imported
# anywhere and raised NameError on a fresh kernel. Filtering is already done by
# the separate `filtrations(...)` call above.
# NOTE(review): assumes the helper returns an (X, y) pair of per-sentence sequences — confirm in source.code.utils.preprocessing.
X, y = crf_pre_processing(data)

In [None]:
# Hold out 33% of the samples for testing with a fixed seed.
# No `stratify` here, unlike the token-level experiments.
# NOTE(review): presumably because each sample is a whole label sequence — confirm.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.33,
    random_state=42,
)

In [None]:
# Linear-chain CRF trained with L-BFGS; c1 / c2 are the L1 / L2 regularization
# coefficients, capped at 100 optimizer iterations. Transition features are
# not generated for label pairs that are absent from the training data.
estimator = sklearn_crf(
    algorithm='lbfgs',
    c1=0.1,
    c2=0.1,
    max_iterations=100,
    all_possible_transitions=False,
)

In [None]:
# 5-fold cross-validation on the training part only, using all CPU cores;
# the metric is whatever the estimator's own `score` method reports.
scores = cross_val_score(
    estimator,
    X_train,
    y_train,
    cv=5,
    verbose=True,
    n_jobs=-1,
)

In [None]:
# Show the per-fold cross-validation scores.
scores

In [None]:
# Fit on the full training split; cross_val_score above does not fit `estimator` itself.
estimator.fit(X_train, y_train)

In [None]:
# Per-tag precision / recall / F1 on the held-out split; the "flat" variant
# flattens the per-sequence label lists before scoring.
report = flat_classification_report(
    y_true=y_test,
    y_pred=estimator.predict(X_test),
)
print(report)

# Bi-LSTM

In [11]:
# Pick the GMB 2.2.0 folders tagged with the Voice of America subcorpus line.
# NOTE(review): exact matching rules live in source.code.utils.utils — confirm there.
target_subcorpus_folders = filter_by_subcorpus(
    '../data/datasets/gmb-2.2.0/',
    'subcorpus: Voice of America',
)

HBox(children=(IntProgress(value=0, description='Read folders: '), HTML(value='')))




HBox(children=(IntProgress(value=0, description='Filter folders: ', max=10000), HTML(value='')))




In [12]:
# Reload the selected folders into a fresh DataFrame for this experiment.
data = get_tagged_texts_as_pd(
    target_subcorpus_folders,
    '../data/datasets/gmb-2.2.0/',
)

In [22]:
# Peek at the first 30 rows to check the loaded columns and annotations.
data.head(30)

Unnamed: 0,token,pos_tag,lemma,ner_tag,word_net_sense_number,verb_net_roles,semantic_relation,animacy_tag,super_tag,lambda_dsr
0,Thousands,NNS,thousand,O,1,[],O,Human,N,"lam(v1,b1:drs([],[b1:[1001]:pred(v1,thousand,n..."
1,of,IN,of,O,0,[],O,O,(NP\NP)/NP,"lam(v1,lam(v2,lam(v3,app(v2,lam(v4,app(v1,lam(..."
2,demonstrators,NNS,demonstrator,O,1,[],O,Human,N,"lam(v1,b1:drs([],[b1:[1003]:pred(v1,demonstrat..."
3,have,VBP,have,O,0,[],O,O,(S[dcl]\NP)/(S[pt]\NP),"lam(v1,lam(v2,lam(v3,app(app(v1,v2),lam(v4,mer..."
4,marched,VBN,march,O,1,[Theme],O,O,S[pt]\NP,"lam(v1,lam(v2,app(v1,lam(v3,merge(b1:drs([b1:[..."
5,through,IN,through,O,0,[],O,O,((S[pt]\NP)\(S[pt]\NP))/NP,"lam(v1,lam(v2,lam(v3,lam(v4,app(app(v2,v3),lam..."
6,London,NNP,london,geo-nam,1,[],O,Place,N,"lam(v1,b1:drs([],[b1:[1007]:named(v1,london,ge..."
7,to,TO,to,O,0,[],O,O,(S[to]\NP)/(S[b]\NP),"lam(v1,lam(v2,lam(v3,merge(b1:drs([b1:[1008]:p..."
8,protest,VB,protest,O,1,"[Theme,Agent]",O,O,(S[b]\NP)/NP,"lam(v1,lam(v2,lam(v3,app(v2,lam(v4,app(v1,lam(..."
9,the,DT,the,O,0,[],O,O,NP/N,"lam(v1,lam(v2,alfa(def,merge(b1:drs([b1:[1010]..."


In [21]:
# Label distribution of the freshly loaded data (before any filtering).
data['ner_tag'].value_counts()

O          1032479
geo-nam      55480
org-nam      44646
per-nam      22931
gpe-nam      19685
tim-dow      11398
tim-dat      10929
per-tit       9672
per-fam       8098
[]            4064
tim-moy       3811
tim-yoc       3009
per-giv       2376
tim-clo        810
art-nam        789
eve-nam        514
nat-nam        280
tim-nam        132
eve-ord         63
per-ini         55
per-ord         35
org-leg         13
tim-dom          8
per-mid          1
art-add          1
Name: ner_tag, dtype: int64

In [None]:
# Apply the project's filtering step with `with_dots=True` and rebind `data`.
# NOTE(review): presumably keeps sentence-final dots so sentence boundaries survive for the sequence model — confirm in source.code.utils.preprocessing.
data = filtrations(data, with_dots=True)

In [None]:
# Label distribution again, now after the filtering step, for comparison.
data['ner_tag'].value_counts()

In [None]:
# Sanity check: column names, dtypes and non-null counts after filtering.
data.info()

In [None]:
# Build the sequence inputs and labels for the tagger.
# The helper imported at the top of the notebook is `crf_pre_processing`;
# the previously used name `crf_filtration_and_pre_processing` is not imported
# anywhere and raised NameError on a fresh kernel. Filtering is already done by
# the separate `filtrations(...)` call above.
# NOTE(review): assumes the helper returns an (X, y) pair in a format BiLSTMTagger accepts — confirm against source.code.models.bilstmtagger.
X, y = crf_pre_processing(data)

In [None]:
# Hold out 33% of the samples for testing with a fixed seed.
# No `stratify` here, unlike the token-level experiments.
# NOTE(review): presumably because each sample is a whole label sequence — confirm.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.33,
    random_state=42,
)

In [None]:
# Project-local Bi-LSTM tagger; `checkpoint_dir` points at the Keras model folder under ../data/datasets.
# NOTE(review): whether checkpoints are read, written, or both is defined in source.code.models.bilstmtagger — confirm.
estimator = BiLSTMTagger(checkpoint_dir='../data/datasets/keras_model/')

In [None]:
# Train the tagger on the training split (no cross-validation in this section).
estimator.fit(X_train, y_train)

In [None]:
# Per-tag precision / recall / F1 on the held-out split; the "flat" variant
# flattens the per-sequence label lists before scoring.
report = flat_classification_report(
    y_true=y_test,
    y_pred=estimator.predict(X_test),
)
print(report)

# Conclusion