# All necessary imports

In [1]:
import sys

In [2]:
sys.path.append('..')

In [3]:
import os
import numpy as np
import pandas as pd

In [5]:
from source.code.utils import filter_by_subcorpus
from source.code.utils import get_tagged_texts_as_pd

In [6]:
from source.code.preprocessing import filtrations
from source.code.preprocessing import additional_features
from source.code.preprocessing import crf_filtration_and_pre_processing

In [7]:
from sklearn.model_selection import cross_val_predict
from sklearn.metrics import classification_report

In [8]:
from sklearn_crfsuite.metrics import flat_classification_report
from sklearn_crfsuite import metrics

# Read the data

In [9]:
# Narrow the GMB 2.2.0 dataset folders down to the 'Voice of America' subcorpus.
# NOTE(review): how the subcorpus string is matched is decided inside
# filter_by_subcorpus, which is not visible in this notebook.
target_subcorpus_folders = filter_by_subcorpus('../data/datasets/gmb-2.2.0/', 'subcorpus: Voice of America')

HBox(children=(IntProgress(value=0, description='Read folders: '), HTML(value='')))




HBox(children=(IntProgress(value=0, description='Filter folders: ', max=10000), HTML(value='')))




In [10]:
# Load the tagged texts of the selected folders.
# Later cells access .token / .ner_tag on frames derived from this result, so it is
# presumably a pandas DataFrame with one row per token — confirm in source.code.utils.
tagged_texts_as_pd = get_tagged_texts_as_pd(target_subcorpus_folders, '../data/datasets/gmb-2.2.0/')

In [14]:
# Filter the raw token table with the default settings of filtrations().
# The CRF section further down calls filtrations() again with with_dots=True
# and rebinds this same name.
tagged_texts_as_pd_f = filtrations(tagged_texts_as_pd)

HBox(children=(IntProgress(value=0, description='Punctuation: ', max=1231279), HTML(value='')))




HBox(children=(IntProgress(value=0, description='Target tags: ', max=1130555), HTML(value='')))




In [None]:
# Add the extra per-token columns to the filtered table; the `features` list
# defined below selects columns from this result.
tagged_texts_as_pd_f_add_f = additional_features(tagged_texts_as_pd_f)

In [None]:
# Quick sanity check of the resulting table (assuming a pandas DataFrame:
# column names, dtypes and non-null counts).
tagged_texts_as_pd_f_add_f.info()

In [None]:
# Name of the column holding the label to predict.
target = 'ner_tag'

# Columns fed to the feature-based classifier, one per line for easy diffing.
features = [
    'semantic_relation_tagged',
    'animacy_tagged',
    'lambda_dsr_len',
    'word_sense_exists',
    'is_title',
    'contains_digits',
    'word_len',
]

# Naive tag frequency memorization

In [None]:
from sklearn.base import BaseEstimator, TransformerMixin

class MemoryTagger(BaseEstimator, TransformerMixin):
    '''
    Baseline tagger that memorizes the most frequent tag of every word.

    Attributes set by ``fit``:

    * ``tags``   -- distinct tags, in order of first appearance in ``y``;
    * ``memory`` -- dict mapping each seen word to its most frequent tag.
    '''

    def fit(self, X, y):
        '''
        Count how often each word carries each tag and remember the winner.

        Expects an iterable of words as X and an iterable of tags as y,
        aligned element by element. When several tags are equally frequent
        for a word, the one seen first for that word is kept.

        Returns self, as the scikit-learn estimator API requires, so the
        tagger can be chained (``MemoryTagger().fit(X, y).predict(X)``).
        '''
        # Function-scope import keeps this notebook cell self-contained.
        from collections import Counter, defaultdict

        tag_counts = defaultdict(Counter)
        # A dict doubles as an insertion-ordered set: O(1) membership
        # instead of scanning a list for every token.
        first_seen = {}
        for word, tag in zip(X, y):
            first_seen.setdefault(tag, None)
            tag_counts[word][tag] += 1

        self.tags = list(first_seen)
        # max() over the Counter's keys returns the first key among ties,
        # i.e. the tag encountered first for that word.
        self.memory = {
            word: max(counts, key=counts.get)
            for word, counts in tag_counts.items()
        }
        return self

    def predict(self, X, y=None):
        '''
        Predict the tag of every word in X from memory.

        Words that were not seen during ``fit`` are tagged 'O'. ``y`` is
        accepted for API compatibility and ignored. Returns a list of tags
        with one entry per element of X.
        '''
        return [self.memory.get(x, 'O') for x in X]

In [None]:
# Inputs are the raw tokens, labels are their NER tags.
X = tagged_texts_as_pd_f_add_f['token']
y = tagged_texts_as_pd_f_add_f['ner_tag']

In [None]:
# 5-fold out-of-fold predictions: each token is tagged by a MemoryTagger
# that was fitted on the other folds only.
pred = cross_val_predict(estimator=MemoryTagger(), X=X, y=y, cv=5)

In [None]:
# Per-tag precision / recall / F1 of the out-of-fold predictions.
report = classification_report(y_pred=pred, y_true=y)
print(report)

# Random Forest Classifier

In [None]:
from sklearn.ensemble import RandomForestClassifier

In [None]:
# Switch from raw tokens to the engineered feature columns; labels stay the same.
X = tagged_texts_as_pd_f_add_f[features]
y = tagged_texts_as_pd_f_add_f[target]

In [None]:
# 5-fold out-of-fold predictions from a 20-tree random forest on the feature columns.
# NOTE(review): no random_state is set, so results differ between runs.
pred = cross_val_predict(RandomForestClassifier(n_estimators=20), X=X, y=y, cv=5)

In [None]:
# Per-tag precision / recall / F1 of the random-forest out-of-fold predictions.
report = classification_report(y_pred=pred, y_true=y)
print(report)

# HMM

# CRF

In [15]:
from sklearn_crfsuite import CRF

In [16]:
# Linear-chain CRF trained with L-BFGS for at most 100 iterations.
# c1 / c2 are the regularization coefficients (L1 / L2 per the sklearn-crfsuite
# docs — confirm); transitions never observed in training are not generated.
crf = CRF(algorithm='lbfgs', c1=0.1, c2=0.1, max_iterations=100, all_possible_transitions=False)

In [17]:
# Redo the filtering with with_dots=True for the sequence models.
# NOTE: this rebinds tagged_texts_as_pd_f, so the earlier (default-filtered)
# table is no longer reachable under this name once this cell has run.
# NOTE(review): presumably with_dots keeps sentence-final dots so sentences can
# be split later — confirm in source.code.preprocessing.
tagged_texts_as_pd_f = filtrations(tagged_texts_as_pd, with_dots=True)

HBox(children=(IntProgress(value=0, description='Punctuation: ', max=1231279), HTML(value='')))




HBox(children=(IntProgress(value=0, description='Target tags: ', max=1187294), HTML(value='')))




In [18]:
# Add the extra per-token columns to the with_dots=True table.
# Note the name: tagged_texts_as_pd_add_f here vs. tagged_texts_as_pd_f_add_f
# in the earlier sections — they are two different tables.
tagged_texts_as_pd_add_f = additional_features(tagged_texts_as_pd_f)

HBox(children=(IntProgress(value=0, description='NER tagged: ', max=1187294), HTML(value='')))




HBox(children=(IntProgress(value=0, description='Semantic relation: ', max=1187294), HTML(value='')))




HBox(children=(IntProgress(value=0, description='Animacy tagged: ', max=1187294), HTML(value='')))




HBox(children=(IntProgress(value=0, description='Lambda-DSR len: ', max=1187294), HTML(value='')))




HBox(children=(IntProgress(value=0, description='Word sense: ', max=1187294), HTML(value='')))




HBox(children=(IntProgress(value=0, description='Is title: ', max=1187294), HTML(value='')))




HBox(children=(IntProgress(value=0, description='Contains digits: ', max=1187294), HTML(value='')))




HBox(children=(IntProgress(value=0, description='Word len: ', max=1187294), HTML(value='')))




In [30]:
# Turn the token table into the per-sentence inputs and label sequences the CRF consumes.
# Slow step: the recorded run below took about 27 minutes.
# NOTE(review): later cells read w['lemma'] from every element of a sentence, so
# `sentences` presumably holds lists of per-token feature mappings — confirm.
sentences, tags = crf_filtration_and_pre_processing(tagged_texts_as_pd_add_f)

100%|██████████| 1187293/1187293 [27:04<00:00, 730.83it/s]


In [None]:
# 5-fold out-of-fold predictions; folds are formed over sentences, not tokens,
# because each element of X / y is a whole sentence.
pred = cross_val_predict(crf, X=sentences, y=tags, cv=5)

In [None]:
# Flatten the per-sentence label sequences and report per-tag precision / recall / F1.
report = flat_classification_report(y_pred=pred, y_true=tags)
print(report)

# Bi-LSTM

In [21]:
# Vocabulary: every distinct lemma that occurs in any sentence ...
words = list({token['lemma'] for sent in sentences for token in sent})
# ... plus a dedicated padding entry at the very end.
words.append("ENDPAD")
n_words = len(words)
n_words

27193

In [24]:
# Distinct NER labels present in the filtered table, and how many there are.
# NOTE(review): this rebinds `tags`, which the CRF cells above use for the
# per-sentence label sequences. Re-running those cells after this one passes
# the wrong `y` to cross_val_predict / flat_classification_report — consider a
# separate name for the label set.
tags = tagged_texts_as_pd_f.ner_tag.unique()
n_tags = len(tags); n_tags

9

In [25]:
# Fixed sequence length used when padding sentences.
max_len = 75
# Word ids are 1-based: the first vocabulary entry gets id 1.
word2idx = {word: idx for idx, word in enumerate(words, start=1)}
# Tag ids are 0-based.
tag2idx = dict(zip(tags, range(len(tags))))

In [28]:
from keras.preprocessing.sequence import pad_sequences
# Replace every token by the integer id of its lemma, sentence by sentence.
X = [[word2idx[token['lemma']] for token in sent] for sent in sentences]

In [29]:
# Right-pad (and cut) every sentence to max_len ids using the id reserved for "ENDPAD".
# word2idx is 1-based, so n_words - 1 is the id of a real vocabulary word and
# padding with it would be indistinguishable from that word; look the id up instead.
X = pad_sequences(maxlen=max_len, sequences=X, padding="post", value=word2idx["ENDPAD"])

In [None]:
# Encode the label of every token as its integer tag id, sentence by sentence.
# NOTE(review): elements of `sentences` are read with the string key 'lemma' in
# the cells above, so the positional w[2] lookup looks inconsistent — confirm
# where the NER tag actually lives for each token before running this cell.
y = [[tag2idx[w[2]] for w in s] for s in sentences]

In [None]:
class BiLSTMTagger(BaseEstimator, TransformerMixin):
    '''
    Placeholder for a Bi-LSTM sequence tagger with a scikit-learn style API.

    Neither method is implemented yet; both raise NotImplementedError so the
    cell is valid Python and any accidental use fails loudly instead of
    silently returning None.
    '''

    def fit(self, X, y):
        '''
        Train the tagger on padded id sequences X and tag-id sequences y.

        Not implemented yet. Once implemented it should return self, as the
        scikit-learn estimator API requires.
        '''
        raise NotImplementedError('BiLSTMTagger.fit is not implemented yet')

    def predict(self, X, y=None):
        '''
        Predict a tag sequence for every sequence in X.

        Not implemented yet. ``y`` is accepted for API compatibility only.
        '''
        raise NotImplementedError('BiLSTMTagger.predict is not implemented yet')

# Conclusion