# All necessary imports

In [None]:
import sys

In [None]:
# Put the parent directory on the import path; the `source.code.*` imports
# below presumably rely on it (TODO confirm the notebook's working directory).
sys.path.append('..')

In [None]:
import os
import numpy as np
import pandas as pd

In [None]:
from source.code.utils import filter_by_subcorpus
from source.code.utils import get_tagged_texts_as_pd

In [None]:
from source.code.preprocessing import filtrations
from source.code.preprocessing import additional_features
from source.code.preprocessing import crf_filtration_and_pre_processing

In [None]:
from sklearn.model_selection import cross_val_predict
from sklearn.metrics import classification_report

In [None]:
from sklearn_crfsuite.metrics import flat_classification_report
from sklearn_crfsuite import metrics

# Read the data

In [None]:
# Filter the GMB 2.2.0 dataset by the 'subcorpus: Voice of America' string.
# NOTE(review): the helper lives in source.code.utils; presumably it returns
# the matching document folders -- confirm against its definition.
target_subcorpus_folders = filter_by_subcorpus('../data/datasets/gmb-2.2.0/', 'subcorpus: Voice of America')

In [None]:
# Load the selected entries into a pandas object (going by the helper's name;
# `.info()` and column access are used on its derivatives below).
# The dataset root is passed again, as a second literal.
tagged_texts_as_pd = get_tagged_texts_as_pd(target_subcorpus_folders, '../data/datasets/gmb-2.2.0/')

In [None]:
# Apply the project's filtration step with its default options
# (the CRF and Bi-LSTM sections call it again with with_dots=True).
tagged_texts_as_pd_f = filtrations(tagged_texts_as_pd)

In [None]:
# Derive additional columns from the filtered frame.
# presumably this adds the columns listed in `features` below -- TODO confirm.
tagged_texts_as_pd_f_add_f = additional_features(tagged_texts_as_pd_f)

In [None]:
# Print the column dtypes and non-null counts as a sanity check.
tagged_texts_as_pd_f_add_f.info()

In [None]:
# Columns fed to the Random Forest section as model inputs ...
features = ['semantic_relation_tagged', 'animacy_tagged', 'lambda_dsr_len', 'word_sense_exists', 'is_title', 'contains_digits', 'word_len']
# ... and the label column predicted there.
target = 'ner_tag'

# Naive tag frequency memorization

In [None]:
from sklearn.base import BaseEstimator, TransformerMixin

class MemoryTagger(BaseEstimator, TransformerMixin):
    '''
    Baseline tagger that memorises the most frequent tag of every word.

    After ``fit`` the estimator exposes:
      * ``tags``   -- list of the distinct tags, in first-seen order;
      * ``memory`` -- dict mapping every seen word to its most frequent tag.
    '''

    def fit(self, X, y):
        '''
        Count word/tag co-occurrences and remember the top tag per word.

        Expects an iterable of words as X and an iterable of tags as y,
        aligned element by element.

        Returns the fitted estimator itself, as scikit-learn estimators are
        expected to, so that ``MemoryTagger().fit(X, y).predict(X)`` works.
        '''
        tag_counts = {}
        self.tags = []
        # Companion set gives O(1) membership tests; the list keeps the order.
        seen_tags = set()
        for word, tag in zip(X, y):
            if tag not in seen_tags:
                seen_tags.add(tag)
                self.tags.append(tag)
            per_word = tag_counts.setdefault(word, {})
            per_word[tag] = per_word.get(tag, 0) + 1
        # On ties ``max`` keeps the tag that was counted first for that word.
        self.memory = {
            word: max(counts, key=counts.get)
            for word, counts in tag_counts.items()
        }
        return self

    def predict(self, X, y=None):
        '''
        Predict the tag from memory. If a word is unknown, predict 'O'.
        '''
        return [self.memory.get(word, 'O') for word in X]

In [None]:
# The memorisation baseline only needs the raw tokens (X) and their NER tags (y).
X, y = tagged_texts_as_pd_f_add_f.token, tagged_texts_as_pd_f_add_f.ner_tag

In [None]:
# Out-of-fold predictions from 5-fold cross-validation: each token is tagged
# by a model that was fitted on the other folds.
pred = cross_val_predict(estimator=MemoryTagger(), X=X, y=y, cv=5)

In [None]:
# Per-tag precision / recall / F1 of the memorisation baseline.
report = classification_report(y_pred=pred, y_true=y)
print(report)

# Random Forest Classifier

In [None]:
from sklearn.ensemble import RandomForestClassifier

In [None]:
# Switch from raw tokens to the feature columns. NOTE: rebinds X and y.
X, y = tagged_texts_as_pd_f_add_f[features], tagged_texts_as_pd_f_add_f[target]

In [None]:
# 5-fold out-of-fold predictions from a 20-tree random forest.
# NOTE(review): no random_state is passed, so results differ between runs.
pred = cross_val_predict(RandomForestClassifier(n_estimators=20), X=X, y=y, cv=5)

In [None]:
# Per-tag precision / recall / F1 of the random forest.
report = classification_report(y_pred=pred, y_true=y)
print(report)

# HMM

# CRF

In [None]:
from sklearn_crfsuite import CRF

In [None]:
# sklearn-crfsuite CRF trained with L-BFGS; c1 and c2 are the L1 and L2
# regularisation coefficients. Transitions never observed in the training
# data are not generated (all_possible_transitions=False).
# NOTE: the name `crf` is rebound to a Keras layer in the Bi-LSTM section.
crf = CRF(algorithm='lbfgs', c1=0.1, c2=0.1, max_iterations=100, all_possible_transitions=False)

In [None]:
# Re-run the filtration, this time with with_dots=True.
# NOTE: rebinds tagged_texts_as_pd_f from the "Read the data" section.
# presumably the dots are kept to delimit sentences -- TODO confirm in filtrations().
tagged_texts_as_pd_f = filtrations(tagged_texts_as_pd, with_dots=True)

In [None]:
# Same feature derivation as before, stored under a different name
# (tagged_texts_as_pd_add_f, not tagged_texts_as_pd_f_add_f).
tagged_texts_as_pd_add_f = additional_features(tagged_texts_as_pd_f)

In [None]:
# Turn the frame into two parallel sequences: `sentences` (per-token mappings;
# a 'lemma' key is read from them in the Bi-LSTM section) and `tags`.
sentences, tags = crf_filtration_and_pre_processing(tagged_texts_as_pd_add_f)

In [None]:
# 5-fold out-of-fold predictions; here one sample is a whole sentence.
pred = cross_val_predict(crf, X=sentences, y=tags, cv=5)

In [None]:
# Flatten the per-sentence tag sequences and report per-tag metrics.
report = flat_classification_report(y_pred=pred, y_true=tags)
print(report)

# Bi-LSTM

In [None]:
# Same call, with the same arguments, as in the CRF section; repeated so the
# Bi-LSTM section can be run on its own.
tagged_texts_as_pd_f = filtrations(tagged_texts_as_pd, with_dots=True)

In [None]:
# Same feature derivation as in the CRF section.
tagged_texts_as_pd_add_f = additional_features(tagged_texts_as_pd_f)

In [None]:
# Same sentence / tag extraction as in the CRF section.
sentences, tags = crf_filtration_and_pre_processing(tagged_texts_as_pd_add_f)

In [None]:
# Vocabulary: every distinct lemma, followed by the padding token.
words = list({token['lemma'] for sentence in sentences for token in sentence})
words.append("ENDPAD")
n_words = len(words)
n_words  # bare expression so the notebook displays the vocabulary size

In [None]:
# Tag inventory; its size is the number of classes of the output layer.
# NOTE(review): taken from tagged_texts_as_pd_f, while the label sequences come
# from crf_filtration_and_pre_processing(tagged_texts_as_pd_add_f); if that
# helper drops or alters tags the two can disagree -- confirm.
unique_tags = tagged_texts_as_pd_f.ner_tag.unique()
n_tags = len(unique_tags); n_tags

In [None]:
# Fixed sequence length used for padding and for the network input.
max_len = 75
# Word indices start at 1, which leaves index 0 unused by real words.
word2idx = {word: index for index, word in enumerate(words, start=1)}
# Tag indices start at 0.
tag2idx = {tag: index for index, tag in enumerate(unique_tags)}

In [None]:
from keras.preprocessing.sequence import pad_sequences
# Encode every sentence as the list of vocabulary indices of its lemmas.
X = [[word2idx[token['lemma']] for token in sentence] for sentence in sentences]

In [None]:
# Pad every sentence at the end up to max_len with the index of the dedicated
# "ENDPAD" token. word2idx is 1-based, so "ENDPAD" sits at n_words;
# n_words - 1 would be the index of a real lemma, not of the padding token.
X = pad_sequences(maxlen=max_len, sequences=X, padding="post", value=word2idx["ENDPAD"])

In [None]:
# Encode every tag sequence as the list of its tag indices.
y = [[tag2idx[tag] for tag in tag_sequence] for tag_sequence in tags]

In [None]:
# Pad the label sequences at the end up to max_len with the index of the "O" tag.
# NOTE(review): padded positions therefore count as "O" when the held-out
# sentences are decoded and scored further down.
y = pad_sequences(maxlen=max_len, sequences=y, padding="post", value=tag2idx["O"])

In [None]:
from keras.utils import to_categorical

In [None]:
# One-hot encode the padded tag indices: one (max_len, n_tags) matrix per sentence.
y = [to_categorical(tag_ids, num_classes=n_tags) for tag_ids in y]

In [None]:
from sklearn.model_selection import train_test_split

In [None]:
# Hold out 10% of the sentences for testing.
# NOTE(review): no random_state is passed, so the split differs on every run.
X_tr, X_te, y_tr, y_te = train_test_split(X, y, test_size=0.1)

In [None]:
from keras.models import Model, Input
from keras.layers import LSTM, Embedding, Dense, TimeDistributed, Dropout, Bidirectional
from keras_contrib.layers import CRF

In [None]:
input = Input(shape=(max_len,))
model = Embedding(input_dim=n_words + 1, output_dim=20,
                  input_length=max_len, mask_zero=True)(input)  # 20-dim embedding
model = Bidirectional(LSTM(units=50, return_sequences=True,
                           recurrent_dropout=0.1))(model)  # variational biLSTM
model = TimeDistributed(Dense(50, activation="relu"))(model)  # a dense layer as suggested by neuralNer
crf = CRF(n_tags)  # CRF layer
out = crf(model)  # output

In [None]:
model = Model(input, out)

In [None]:
# Train with the CRF layer's own loss and track its own accuracy metric.
model.compile(optimizer="rmsprop", loss=crf.loss_function, metrics=[crf.accuracy])

In [None]:
# Print the layer-by-layer overview of the assembled model.
model.summary()

In [None]:
# Train for 5 epochs in batches of 32, keeping 10% of the training split for
# validation. y_tr is a list of per-sentence arrays, hence the np.array(...).
history = model.fit(X_tr, np.array(y_tr), batch_size=32, epochs=5, validation_split=0.1, verbose=1)

In [None]:
# Training log as a frame: one row per epoch, one column per logged quantity.
hist = pd.DataFrame(history.history)

In [None]:
import matplotlib.pyplot as plt
plt.style.use("ggplot")
plt.figure(figsize=(12,12))
# The accuracy column names depend on the Keras / keras-contrib version
# ("acc", "accuracy", "crf_viterbi_accuracy", ...), so plot every column whose
# name mentions accuracy rather than indexing fixed keys, which can raise KeyError.
accuracy_columns = [column for column in hist.columns if "acc" in column]
for column in accuracy_columns:
    plt.plot(hist[column], label=column)
if accuracy_columns:
    # Label the curves so training and validation accuracy can be told apart.
    plt.legend()
plt.show()

In [None]:
from sklearn.metrics import precision_score, recall_score, f1_score, classification_report

In [None]:
# Model outputs for the held-out sentences; pred2label below takes the argmax
# of every timestep's vector to recover a tag.
test_pred = model.predict(X_te, verbose=1)

In [None]:
# Inverse of tag2idx: tag index -> tag string.
idx2tag = {index: tag for tag, index in tag2idx.items()}

def pred2label(pred):
    '''
    Decode per-timestep score vectors into tag strings.

    `pred` is an iterable of sequences; every element of a sequence is a
    vector whose argmax is looked up in the module-level `idx2tag` mapping.
    Any "PAD" substring in the resulting tag is replaced by "O".
    Returns a list of lists of tag strings with the same nesting as `pred`.
    '''
    return [
        [idx2tag[np.argmax(scores)].replace("PAD", "O") for scores in sequence]
        for sequence in pred
    ]
    
# Decode both the model outputs and the one-hot gold labels to tag strings.
# Every position of every sequence is decoded, padded positions included.
pred_labels = pred2label(test_pred)
test_labels = pred2label(y_te)

In [None]:
# Keyword arguments keep gold and predicted labels in the right slots: the
# positional order is (y_true, y_pred), so passing the predictions first would
# swap precision and recall. Same calling style as in the CRF section.
print(flat_classification_report(y_true=test_labels, y_pred=pred_labels))

In [None]:
class BiLSTMTagger(BaseEstimator, TransformerMixin):
    '''
    Placeholder for a scikit-learn style wrapper around the Bi-LSTM-CRF model.

    Not implemented yet: both methods raise NotImplementedError, so the cell
    is valid Python and accidental use fails loudly instead of silently.
    '''

    def fit(self, X, y):
        '''
        Train the tagger on X and y. Not implemented yet.
        '''
        raise NotImplementedError('BiLSTMTagger.fit is not implemented yet')

    def predict(self, X, y=None):
        '''
        Predict tags for X. Not implemented yet.
        '''
        raise NotImplementedError('BiLSTMTagger.predict is not implemented yet')

# Conclusion