# All imports necessary

In [1]:
import sys

In [2]:
sys.path.append('..')

In [3]:
import os
import numpy as np
import pandas as pd

In [5]:
from source.code.utils import filter_by_subcorpus
from source.code.utils import get_tagged_texts_as_pd

In [6]:
from source.code.preprocessing import filtrations
from source.code.preprocessing import additional_features
from source.code.preprocessing import crf_filtration_and_pre_processing

In [32]:
from sklearn.model_selection import cross_validate
from sklearn.model_selection import cross_val_predict
from sklearn.metrics import classification_report
from sklearn.metrics import make_scorer

In [8]:
from sklearn_crfsuite.metrics import flat_classification_report
from sklearn_crfsuite import metrics

# Read the data

In [9]:
target_subcorpus_folders = filter_by_subcorpus('../data/datasets/gmb-2.2.0/', 'subcorpus: Voice of America')

HBox(children=(IntProgress(value=0, description='Read folders: '), HTML(value='')))




HBox(children=(IntProgress(value=0, description='Filter folders: ', max=10000), HTML(value='')))




In [10]:
tagged_texts_as_pd = get_tagged_texts_as_pd(target_subcorpus_folders, '../data/datasets/gmb-2.2.0/')

In [35]:
tagged_texts_as_pd_f = filtrations(tagged_texts_as_pd)

HBox(children=(IntProgress(value=0, description='Punctuation: ', max=1231279), HTML(value='')))




HBox(children=(IntProgress(value=0, description='Target tags: ', max=1130555), HTML(value='')))




In [36]:
tagged_texts_as_pd_f_add_f = additional_features(tagged_texts_as_pd_f)

HBox(children=(IntProgress(value=0, description='NER tagged: ', max=1130555), HTML(value='')))




HBox(children=(IntProgress(value=0, description='Semantic relation: ', max=1130555), HTML(value='')))




HBox(children=(IntProgress(value=0, description='Animacy tagged: ', max=1130555), HTML(value='')))




HBox(children=(IntProgress(value=0, description='Lambda-DSR len: ', max=1130555), HTML(value='')))




HBox(children=(IntProgress(value=0, description='Word sense: ', max=1130555), HTML(value='')))




HBox(children=(IntProgress(value=0, description='Is title: ', max=1130555), HTML(value='')))




HBox(children=(IntProgress(value=0, description='Contains digits: ', max=1130555), HTML(value='')))




HBox(children=(IntProgress(value=0, description='Word len: ', max=1130555), HTML(value='')))




In [37]:
tagged_texts_as_pd_f_add_f.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1130555 entries, 0 to 1231277
Data columns (total 18 columns):
token                       1130555 non-null object
pos_tag                     1130555 non-null object
lemma                       1130555 non-null object
ner_tag                     1130555 non-null object
word_net_sense_number       1130555 non-null int64
verb_net_roles              1130555 non-null object
semantic_relation           1130555 non-null object
animacy_tag                 1130555 non-null object
super_tag                   1130555 non-null object
lambda_dsr                  1130555 non-null object
ner_tagged                  1130555 non-null int64
semantic_relation_tagged    1130555 non-null int64
animacy_tagged              1130555 non-null int64
lambda_dsr_len              1130555 non-null int64
word_sense_exists           1130555 non-null int64
is_title                    1130555 non-null int64
contains_digits             1130555 non-null int64
word_len   

In [38]:
features = ['semantic_relation_tagged', 'animacy_tagged', 'lambda_dsr_len', 'word_sense_exists', 'is_title', 'contains_digits', 'word_len']
target = 'ner_tag'

# Naive tag frequency memorization

In [39]:
from sklearn.base import BaseEstimator, TransformerMixin

class MemoryTagger(BaseEstimator, TransformerMixin):
    
    def fit(self, X, y):
        '''
        Expects a list of words as X and a list of tags as y.
        '''
        voc = {}
        self.tags = []
        for x, t in zip(X, y):
            if t not in self.tags:
                self.tags.append(t)
            if x in voc:
                if t in voc[x]:
                    voc[x][t] += 1
                else:
                    voc[x][t] = 1
            else:
                voc[x] = {t: 1}
        self.memory = {}
        for k, d in voc.items():
            self.memory[k] = max(d, key=d.get)
    
    def predict(self, X, y=None):
        '''
        Predict the the tag from memory. If word is unknown, predict 'O'.
        '''
        return [self.memory.get(x, 'O') for x in X]

In [40]:
X, y = tagged_texts_as_pd_f_add_f.token, tagged_texts_as_pd_f_add_f.ner_tag

In [42]:
pred = cross_val_predict(estimator=MemoryTagger(), X=X, y=y, cv=5)

In [43]:
report = classification_report(y_pred=pred, y_true=y)
print(report)

              precision    recall  f1-score   support

           O       0.97      0.99      0.98    936477
         art       0.30      0.10      0.15       787
         eve       0.53      0.20      0.29       575
         geo       0.82      0.85      0.83     55427
         gpe       0.95      0.94      0.95     19685
         nat       0.53      0.40      0.45       280
         org       0.76      0.60      0.67     44407
         per       0.87      0.80      0.84     43118
         tim       0.94      0.71      0.81     29799

   micro avg       0.95      0.95      0.95   1130555
   macro avg       0.74      0.62      0.66   1130555
weighted avg       0.95      0.95      0.95   1130555



# Random Forest Classifier

In [44]:
from sklearn.ensemble import RandomForestClassifier

In [45]:
X, y = tagged_texts_as_pd_f_add_f[features], tagged_texts_as_pd_f_add_f[target]

In [48]:
pred = cross_val_predict(RandomForestClassifier(n_estimators=20), X=X, y=y, cv=5)

In [49]:
report = classification_report(y_pred=pred, y_true=y)
print(report)

              precision    recall  f1-score   support

           O       1.00      0.99      0.99    936477
         art       0.14      0.01      0.02       787
         eve       0.30      0.02      0.03       575
         geo       0.56      0.73      0.63     55427
         gpe       0.99      0.92      0.95     19685
         nat       0.13      0.01      0.02       280
         org       0.63      0.46      0.53     44407
         per       0.62      0.78      0.69     43118
         tim       0.91      0.63      0.75     29799

   micro avg       0.94      0.94      0.94   1130555
   macro avg       0.59      0.50      0.51   1130555
weighted avg       0.94      0.94      0.94   1130555



# HMM

# CRF

In [11]:
from sklearn_crfsuite import CRF

In [12]:
crf = CRF(algorithm='lbfgs', c1=0.1, c2=0.1, max_iterations=100, all_possible_transitions=False)

In [21]:
tagged_texts_as_pd_add_f = additional_features(tagged_texts_as_pd)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self[name] = value


HBox(children=(IntProgress(value=0, description='NER tagged: ', max=1227215), HTML(value='')))




A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  df['ner_tagged'] = df.ner_tag.progress_apply(lambda x: int(str(x) != 'O'))


HBox(children=(IntProgress(value=0, description='Semantic relation: ', max=1227215), HTML(value='')))




A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  df['semantic_relation_tagged'] = df.semantic_relation.progress_apply(lambda x: int(str(x) != 'O'))


HBox(children=(IntProgress(value=0, description='Animacy tagged: ', max=1227215), HTML(value='')))




A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  df['animacy_tagged'] = df.animacy_tag.progress_apply(lambda x: int(str(x) != 'O'))


HBox(children=(IntProgress(value=0, description='Lambda-DSR len: ', max=1227215), HTML(value='')))




A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  df['lambda_dsr_len'] = df.lambda_dsr.progress_apply(lambda x: len(str(x)))


HBox(children=(IntProgress(value=0, description='Word sense: ', max=1227215), HTML(value='')))




A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  df['word_sense_exists'] = df.word_net_sense_number.progress_apply(lambda x: int(int(x) > 0))


HBox(children=(IntProgress(value=0, description='Is title: ', max=1227215), HTML(value='')))




A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  df['is_title'] = df.token.progress_apply(lambda x: int(str(x).istitle()))


HBox(children=(IntProgress(value=0, description='Contains digits: ', max=1227215), HTML(value='')))




A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  df['contains_digits'] = df.token.progress_apply(lambda x: int(not str(x).isalpha()))


HBox(children=(IntProgress(value=0, description='Word len: ', max=1227215), HTML(value='')))




A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  df['word_len'] = df.token.progress_apply(lambda x: len(str(x)))


In [22]:
sentences, tags = crf_filtration_and_pre_processing(tagged_texts_as_pd_add_f)

100%|██████████| 1227214/1227214 [27:41<00:00, 738.60it/s]


In [33]:
pred = cross_val_predict(crf, X=sentences, y=tags, cv=5)

In [34]:
report = flat_classification_report(y_pred=pred, y_true=tags)
print(report)

              precision    recall  f1-score   support

           O       1.00      1.00      1.00    936583
         art       0.69      0.44      0.54       787
         eve       0.66      0.51      0.58       575
         geo       0.96      0.97      0.96     55427
         gpe       0.99      0.96      0.98     19685
         nat       0.96      0.90      0.93       280
         org       0.94      0.94      0.94     44407
         per       0.97      0.98      0.97     43118
         tim       0.98      0.98      0.98     29799

   micro avg       0.99      0.99      0.99   1130661
   macro avg       0.91      0.85      0.88   1130661
weighted avg       0.99      0.99      0.99   1130661



# Bi-LSTM

# Conclusion