# All necessary imports

In [None]:
import warnings
# Install a blanket "ignore" filter so that no warning is shown for the rest of
# the session (presumably to keep cell output readable).
# NOTE(review): this also hides warnings that may be relevant, e.g. convergence
# or deprecation warnings raised by the libraries used below.
warnings.filterwarnings("ignore")

In [None]:
import sys

In [None]:
# Add the parent directory to the import path so that the `source.code...`
# imports further down can be resolved (assumes the notebook is run from a
# direct subdirectory of the project root — TODO confirm).
sys.path.append('..')

In [None]:
import os

In [None]:
import numpy as np

In [None]:
import pandas as pd

In [None]:
from source.code.utils.utils import filter_by_subcorpus
from source.code.utils.utils import get_tagged_texts_as_pd

from source.code.transformers.sentenceextractor import SentenceExtractor
from source.code.models.memorytagger import MemoryTagger
from source.code.models.bilstmtagger import BiLSTMTagger

from source.code.utils.preprocessing import filtrations
from source.code.utils.preprocessing import additional_features
from source.code.utils.preprocessing import crf_pre_processing

In [None]:
from sklearn_crfsuite.metrics import flat_classification_report as crfsuite_classification_report

from sklearn_crfsuite import metrics

from sklearn_crfsuite import CRF as sklearn_crf

In [None]:
from seqeval.metrics import classification_report as seqeval_classification_report

In [None]:
from sklearn.metrics import classification_report

from sklearn.ensemble import RandomForestClassifier

from sklearn.model_selection import train_test_split

from sklearn.model_selection import cross_val_score

In [None]:
from keras.preprocessing.sequence import pad_sequences

In [None]:
# Name of the label column that every model in this notebook predicts.
target = 'ner_tag'

# Columns used as model inputs by the feature-based classifier
# (selected later via X[features]).
features = [
    'semantic_relation_tagged',
    'animacy_tagged',
    'lambda_dsr_len',
    'word_sense_exists',
    'is_title',
    'contains_digits',
    'word_len',
]

In [None]:
# Raise the pandas row-display limit to 30000 so long outputs (e.g. value
# counts) are rendered in full rather than being truncated.
pd.options.display.max_rows = 30000

# Naive tag frequency memorization

In [None]:
# Select the corpus folders for the 'subcorpus: Voice of America' label under
# the GMB 2.2.0 dataset directory (matching logic lives in filter_by_subcorpus,
# which is not visible here).
target_subcorpus_folders = filter_by_subcorpus('../data/datasets/gmb-2.2.0/', 'subcorpus: Voice of America')

In [None]:
# Load the tagged texts of the selected folders into a DataFrame-like object;
# the cells below rely on its `token` and `ner_tag` columns.
X = get_tagged_texts_as_pd(target_subcorpus_folders, '../data/datasets/gmb-2.2.0/')

In [None]:
# Tag distribution before filtering.
X.ner_tag.value_counts()

In [None]:
# Project-specific filtering step (presumably drops unwanted rows/tags —
# confirm in source.code.utils.preprocessing).
X = filtrations(X)

In [None]:
# Tag distribution after filtering, for comparison with the counts above.
X.ner_tag.value_counts()

In [None]:
# Presumably adds derived feature columns — TODO confirm. This section only
# uses `token` and `ner_tag` afterwards.
X = additional_features(X)

In [None]:
X.info()

In [None]:
# The raw token is the only input and its NER tag is the label.
# Note: X is rebound from the full frame to the single `token` column here.
X, y = X.token, X.ner_tag

In [None]:
# Hold out 33% of the samples for testing. stratify=y keeps the tag
# proportions equal in both splits; random_state fixes the shuffle.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42, stratify=y)

In [None]:
estimator = MemoryTagger()

In [None]:
# 5-fold cross-validation on the training split only, on all CPU cores
# (n_jobs=-1). No `scoring` is given, so the estimator's own `score` is used.
scores = cross_val_score(estimator, X_train, y_train, cv=5, verbose=True, n_jobs=-1)

In [None]:
scores

In [None]:
# cross_val_score works on clones, so the estimator itself still has to be
# fitted; this is done on the full training split.
estimator.fit(X_train, y_train)

In [None]:
# Per-tag precision / recall / F1 on the held-out split.
report = classification_report(y_pred=estimator.predict(X_test), y_true=y_test)
print(report)

# Random Forest Classifier

In [None]:
# Rebuild the dataset from scratch so this section does not depend on the
# state left behind by the previous one (X was rebound to a single column).
# Select the corpus folders for the 'subcorpus: Voice of America' label.
target_subcorpus_folders = filter_by_subcorpus('../data/datasets/gmb-2.2.0/', 'subcorpus: Voice of America')

In [None]:
# Load the tagged texts of the selected folders into a DataFrame-like object.
X = get_tagged_texts_as_pd(target_subcorpus_folders, '../data/datasets/gmb-2.2.0/')

In [None]:
# Tag distribution before filtering.
X.ner_tag.value_counts()

In [None]:
# Project-specific filtering step (presumably drops unwanted rows/tags —
# confirm in source.code.utils.preprocessing).
X = filtrations(X)

In [None]:
# Tag distribution after filtering, for comparison with the counts above.
X.ner_tag.value_counts()

In [None]:
# Presumably the step that adds the columns listed in `features` — TODO
# confirm against additional_features.
X = additional_features(X)

In [None]:
X.info()

In [None]:
# Keep only the configured feature columns as inputs and the `target` column
# as the label. X is rebound from the full frame to the feature subset.
X, y = X[features], X[target]

In [None]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42, stratify=y)

In [None]:
estimator = RandomForestClassifier()

In [None]:
scores = cross_val_score(estimator, X_train, y_train, cv=5, verbose=True, n_jobs=-1)

In [None]:
scores

In [None]:
estimator.fit(X_train, y_train)

In [None]:
report = classification_report(y_pred=estimator.predict(X_test), y_true=y_test)
print(report)

# HMM

TBD: an HMM tagger has not been added to this comparison yet.

# CRF

In [None]:
target_subcorpus_folders = filter_by_subcorpus('../data/datasets/gmb-2.2.0/', 'subcorpus: Voice of America')

In [None]:
X = get_tagged_texts_as_pd(target_subcorpus_folders, '../data/datasets/gmb-2.2.0/')

In [None]:
X.ner_tag.value_counts()

In [None]:
X = filtrations(X, with_dots=True)

In [None]:
X.ner_tag.value_counts()

In [None]:
X = additional_features(X)

In [None]:
X.info()

In [None]:
X, y = crf_filtration_and_pre_processing(X)

In [None]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42)

In [None]:
estimator = sklearn_crf(algorithm='lbfgs', c1=0.1, c2=0.1, max_iterations=100, all_possible_transitions=False)

In [None]:
scores = cross_val_score(estimator, X_train, y_train, cv=5, verbose=True, n_jobs=-1)

In [None]:
scores

In [None]:
estimator.fit(X_train, y_train)

In [None]:
report = flat_classification_report(y_pred=estimator.predict(X_test), y_true=y_test)
print(report)

# Bi-LSTM

In [None]:
# Rebuild the dataset from scratch so this section does not depend on the
# state left behind by the previous one.
# Select the corpus folders for the 'subcorpus: Voice of America' label.
target_subcorpus_folders = filter_by_subcorpus('../data/datasets/gmb-2.2.0/', 'subcorpus: Voice of America')

In [None]:
# Load the tagged texts of the selected folders into a DataFrame-like object.
X = get_tagged_texts_as_pd(target_subcorpus_folders, '../data/datasets/gmb-2.2.0/')

In [None]:
# Peek at the first 30 rows of the raw frame.
X.head(30)

In [None]:
# Tag distribution before filtering.
X.ner_tag.value_counts()

In [None]:
# Same filtering helper as in the earlier sections, but with with_dots=True
# (presumably keeps sentence-final dots so sentence boundaries survive —
# confirm in source.code.utils.preprocessing).
X = filtrations(X, with_dots=True)

In [None]:
# Tag distribution after filtering, for comparison with the counts above.
X.ner_tag.value_counts()

In [None]:
# Number of distinct tags left after filtering.
X.ner_tag.nunique()

In [None]:
# Turn the frame into an (X, y) pair; the cells below iterate y as a sequence
# of per-sentence tag sequences. X is rebound and is no longer the frame.
X, y = SentenceExtractor().fit_transform(X)

In [None]:
# NOTE(review): hard-coded and not referenced by any later cell in this
# notebook. Presumably meant to mirror the X.ner_tag.nunique() value shown
# above — confirm, or remove.
n_tags = 17

In [None]:
# Hold out 33% of the samples for testing with a fixed shuffle seed
# (no `stratify`; each y entry is a tag sequence, not a single label).
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42)

In [None]:
# checkpoint_dir is passed to the tagger (presumably where Keras checkpoints
# are written — confirm in BiLSTMTagger).
estimator = BiLSTMTagger(checkpoint_dir='../data/datasets/keras_model/')

In [None]:
estimator.fit(X_train, y_train)

In [None]:
y_pred = estimator.predict(X_test)

In [None]:
# Bring the gold tags into the same padded shape as the predictions:
# 1) map every tag to its integer index using the fitted tagger's vocabulary,
# 2) pad every sentence at the end to length 75 with the index of "O",
# 3) map the indices back to tag strings.
# NOTE(review): maxlen=75 presumably has to match the sequence length used
# inside BiLSTMTagger — confirm. pad_sequences also truncates sequences longer
# than maxlen (from the front by default), and the padded positions are
# labelled "O", so they count as "O" tokens in the flat report below.
y_test = [[estimator.tag2idx[w] for w in s] for s in y_test]
y_test = pad_sequences(maxlen=75, sequences=y_test, padding="post", value=estimator.tag2idx["O"])
y_test = [[estimator.idx2tag[w] for w in s] for s in y_test]

In [None]:
# Token-level report over the flattened tag sequences.
crfsuite_report = crfsuite_classification_report(y_pred=y_pred, y_true=y_test)
print(crfsuite_report)

In [None]:
# seqeval's classification report on the same sequences, for comparison.
seqeval_report = seqeval_classification_report(y_pred=y_pred, y_true=y_test)
print(seqeval_report)

# Conclusion