In [21]:
from itertools import chain
import nltk
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.preprocessing import LabelBinarizer
import pycrfsuite


# Nano NER System 
This is an NER system for all things Nano, built with python-crfsuite. We start by downloading a corpus. Let's use the CoNLL-2002 NER corpus available from NLTK.

In [22]:
%%time
# Materialise the CoNLL-2002 'esp.train' and 'esp.testb' files as lists of
# IOB-tagged sentences (presumably the Spanish portion of the corpus).
# NOTE(review): assumes the conll2002 corpus is already available to NLTK
# on this machine -- confirm, or add an explicit download step.
train_sents = list(nltk.corpus.conll2002.iob_sents('esp.train'))
test_sents = list(nltk.corpus.conll2002.iob_sents('esp.testb'))

CPU times: user 2.74 s, sys: 122 ms, total: 2.86 s
Wall time: 2.87 s


Data Format:

In [23]:
# Display the first training sentence to show the data format:
# each item is a (token, POS tag, IOB label) tuple.
train_sents[0]


[('Melbourne', 'NP', 'B-LOC'),
 ('(', 'Fpa', 'O'),
 ('Australia', 'NP', 'B-LOC'),
 (')', 'Fpt', 'O'),
 (',', 'Fc', 'O'),
 ('25', 'Z', 'O'),
 ('may', 'NC', 'O'),
 ('(', 'Fpa', 'O'),
 ('EFE', 'NC', 'B-ORG'),
 (')', 'Fpt', 'O'),
 ('.', 'Fp', 'O')]

## Features
Next, define some features. In this example we use word identity, word suffix, word shape and word POS tag; also, some information from nearby words is used.

In [24]:
def word2features(sent, i):
    """Build the list of string features for the token at position ``i``.

    ``sent`` is a sequence of items whose first element is the token text
    and whose second element is the POS tag. The returned list contains,
    in order:

    * features of the token itself (bias, lowercase form, 3- and
      2-character suffixes, upper/title/digit flags, POS tag and its
      two-character prefix);
    * the same kind of features for the previous token prefixed with
      ``-1:``, or the marker ``'BOS'`` when ``i`` is not greater than 0;
    * the same kind of features for the next token prefixed with ``+1:``,
      or the marker ``'EOS'`` when ``i`` is the last index.
    """

    def _neighbour_features(prefix, position):
        # Context features share one layout; only the prefix differs.
        neighbour = sent[position][0]
        neighbour_tag = sent[position][1]
        return [
            prefix + ':word.lower=' + neighbour.lower(),
            prefix + ':word.istitle=' + str(neighbour.istitle()),
            prefix + ':word.isupper=' + str(neighbour.isupper()),
            prefix + ':postag=' + neighbour_tag,
            prefix + ':postag[:2]=' + neighbour_tag[:2],
        ]

    token = sent[i][0]
    tag = sent[i][1]

    feats = ['bias']
    feats.append('word.lower=' + token.lower())
    feats.append('word[-3:]=' + token[-3:])
    feats.append('word[-2:]=' + token[-2:])
    feats.append('word.isupper=' + str(token.isupper()))
    feats.append('word.istitle=' + str(token.istitle()))
    feats.append('word.isdigit=' + str(token.isdigit()))
    feats.append('postag=' + tag)
    feats.append('postag[:2]=' + tag[:2])

    # Left context, or a beginning-of-sentence marker.
    if i > 0:
        feats.extend(_neighbour_features('-1', i - 1))
    else:
        feats.append('BOS')

    # Right context, or an end-of-sentence marker.
    if i < len(sent) - 1:
        feats.extend(_neighbour_features('+1', i + 1))
    else:
        feats.append('EOS')

    return feats


def sent2features(sent):
    """Return one feature list per token of ``sent``, in token order.

    Each entry is the result of ``word2features(sent, position)``.
    """
    per_token = []
    for position in range(len(sent)):
        per_token.append(word2features(sent, position))
    return per_token

def sent2labels(sent):
    """Return the label (third element) of every 3-item entry in ``sent``."""
    labels = []
    for _token, _postag, label in sent:
        labels.append(label)
    return labels

def sent2tokens(sent):
    """Return the token (first element) of every 3-item entry in ``sent``."""
    tokens = []
    for token, _postag, _label in sent:
        tokens.append(token)
    return tokens

This is what an example feature list would look like:

In [25]:
# Feature list for the first token of the first test sentence.
sent2features(test_sents[0])[0]

['bias',
 'word.lower=la',
 'word[-3:]=La',
 'word[-2:]=La',
 'word.isupper=False',
 'word.istitle=True',
 'word.isdigit=False',
 'postag=DA',
 'postag[:2]=DA',
 'BOS',
 '+1:word.lower=coruña',
 '+1:word.istitle=True',
 '+1:word.isupper=False',
 '+1:postag=NC',
 '+1:postag[:2]=NC']

In [26]:
# Next, we extract the features

In [28]:
%%time
# Per-sentence feature sequences (X_*) and label sequences (y_*),
# kept parallel so they can later be zipped together.
X_train = list(map(sent2features, train_sents))
y_train = list(map(sent2labels, train_sents))

X_test = list(map(sent2features, test_sents))
y_test = list(map(sent2labels, test_sents))

CPU times: user 3.02 s, sys: 188 ms, total: 3.21 s
Wall time: 3.21 s


## Train the model
To train the model, we create a pycrfsuite.Trainer, load the training data and call its 'train' method. First, create the pycrfsuite.Trainer and load the training data into CRFsuite:

In [34]:
# Create the trainer; verbose=False is passed to its constructor.
trainer = pycrfsuite.Trainer(verbose=False)

# Hand the trainer one (feature sequence, label sequence) pair per sentence.
for xseq, yseq in zip(X_train, y_train):
    trainer.append(xseq, yseq)

In [34]:
# Set the training parameters on the trainer before train() is called.
trainer.set_params({
    'c1': 0.1,   # coefficient for L1 penalty
    'c2': 0.1,  # coefficient for L2 penalty
    'max_iterations': 100,  # stop earlier

    # include transitions that are possible, but not observed
    'feature.possible_transitions': True
})

In [36]:
%%time
# Run training; the argument is the model file name, which the tagger
# opens under the same name in a later cell.
trainer.train('conll2002-esp.crfsuite')

CPU times: user 39 s, sys: 225 ms, total: 39.2 s
Wall time: 39.3 s


In [37]:
# Create a tagger and open the model file written by trainer.train().
tagger = pycrfsuite.Tagger()
tagger.open('conll2002-esp.crfsuite')

<contextlib.closing at 0x11d6784e0>

In [38]:
# Sanity check on a single sentence: print its tokens, then the tagger's
# predicted labels next to the gold labels for visual comparison.
example_sent = test_sents[0]
print(' '.join(sent2tokens(example_sent)), end='\n\n')

print("Predicted:", ' '.join(tagger.tag(sent2features(example_sent))))
print("Correct:  ", ' '.join(sent2labels(example_sent)))

La Coruña , 23 may ( EFECOM ) .

Predicted: B-LOC I-LOC O O O O B-ORG O O
Correct:   B-LOC I-LOC O O O O B-ORG O O


In [39]:
def bio_classification_report(y_true, y_pred):
    """
    Build a classification report for BIO-encoded label sequences.

    ``y_true`` and ``y_pred`` are iterables of label sequences. Both are
    flattened, so the metrics are computed per token, and the "O" label
    is left out of the report. Rows are ordered by the part of the tag
    after the first '-' and then by the part before it.

    The label binarizer is fitted on ``y_true`` only.

    Note from the original author: scikit-learn 0.15+ (or a version from
    github master) is required to calculate averages properly!
    """
    binarizer = LabelBinarizer()

    flat_true = list(chain.from_iterable(y_true))
    true_matrix = binarizer.fit_transform(flat_true)

    flat_pred = list(chain.from_iterable(y_pred))
    pred_matrix = binarizer.transform(flat_pred)

    def _entity_then_prefix(tag):
        # 'B-LOC' -> ['LOC', 'B']
        return tag.split('-', 1)[::-1]

    reported_tags = sorted(set(binarizer.classes_) - {'O'}, key=_entity_then_prefix)

    # Map each class name to its column index in the binarized matrices.
    column_of = {}
    for column, cls in enumerate(binarizer.classes_):
        column_of[cls] = column

    report_columns = [column_of[tag] for tag in reported_tags]
    return classification_report(
        true_matrix,
        pred_matrix,
        labels=report_columns,
        target_names=reported_tags,
    )

In [40]:
%%time
# Tag every test sentence; y_pred stays parallel to y_test.
y_pred = list(map(tagger.tag, X_test))

CPU times: user 502 ms, sys: 3.53 ms, total: 505 ms
Wall time: 506 ms


In [41]:
# Print the token-level report for the test set ("O" labels are excluded
# by bio_classification_report).
print(bio_classification_report(y_test, y_pred))


             precision    recall  f1-score   support

      B-LOC       0.81      0.79      0.80      1084
      I-LOC       0.68      0.64      0.66       325
     B-MISC       0.71      0.55      0.62       339
     I-MISC       0.67      0.59      0.63       557
      B-ORG       0.81      0.83      0.82      1400
      I-ORG       0.85      0.78      0.81      1104
      B-PER       0.85      0.89      0.87       735
      I-PER       0.89      0.94      0.92       634

avg / total       0.81      0.78      0.79      6178

