In [1]:
import os

import sklearn
from sklearn.metrics import make_scorer
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import RandomizedSearchCV

import sklearn_crfsuite
from sklearn_crfsuite import scorers, metrics

def read_ner_data(data_dir:str, file_name:str):
    with open(os.path.join(data_dir, file_name), mode = 'r', encoding= 'utf-8') as f:
        data = [line.strip().splitlines() for line in f.read().split('\n=====\n') if line.strip()]
        data = [[tuple(tk.split('\t')) for tk in line] for line in data]
    return data

data_dir = os.path.join(os.getcwd(), 'data')
print (data_dir)

/home/jovyan/data


In [2]:
%time train_sents = list(read_ner_data(data_dir, 'train.txt'))
print ('n_Train Data set: %d\n'%len(train_sents))

%time test_sents = list(read_ner_data(data_dir, 'test.txt'))
print ('n_Test Data set: %d\n'%len(test_sents))

CPU times: user 168 ms, sys: 7.69 ms, total: 175 ms
Wall time: 384 ms
n_Train Data set: 3360

CPU times: user 11 ms, sys: 3.9 ms, total: 14.9 ms
Wall time: 15.2 ms
n_Test Data set: 300



In [6]:
def word2features(sent, i):
    word = sent[i][0]
    postag = sent[i][1]

    features = {
        'bias': 1.0,
        'word.lower()': word.lower(),
        'word[-3:]': word[-3:],
        'word[-2:]': word[-2:],
        'word.isupper()': word.isupper(),
        'word.istitle()': word.istitle(),
        'word.isdigit()': word.isdigit(),
        'postag': postag,
        'postag[:2]': postag[:2],
    }
    if i > 0:
        word1 = sent[i-1][0]
        postag1 = sent[i-1][1]
        features.update({
            '-1:word.lower()': word1.lower(),
            '-1:word.istitle()': word1.istitle(),
            '-1:word.isupper()': word1.isupper(),
            '-1:postag': postag1,
            '-1:postag[:2]': postag1[:2],
        })
    else:
        features['BOS'] = True

    if i < len(sent)-1:
        word1 = sent[i+1][0]
        postag1 = sent[i+1][1]
        features.update({
            '+1:word.lower()': word1.lower(),
            '+1:word.istitle()': word1.istitle(),
            '+1:word.isupper()': word1.isupper(),
            '+1:postag': postag1,
            '+1:postag[:2]': postag1[:2],
        })
    else:
        features['EOS'] = True

    return features


def sent2features(sent):
    return [word2features(sent, i) for i in range(len(sent))]

def sent2labels(sent):
    return [label for token, postag, label in sent]

def sent2tokens(sent):
    return [token for token, postag, label in sent]

In [7]:
from pprint import pprint
i = 5
pprint (train_sents[0][i])

print ('\n========================\n')

pprint (sent2features(train_sents[0])[i])

('받', 'VV', 'O')


{'+1:postag': 'ETM',
 '+1:postag[:2]': 'ET',
 '+1:word.istitle()': False,
 '+1:word.isupper()': False,
 '+1:word.lower()': '는',
 '-1:postag': 'NNB',
 '-1:postag[:2]': 'NN',
 '-1:word.istitle()': False,
 '-1:word.isupper()': False,
 '-1:word.lower()': '원',
 'bias': 1.0,
 'postag': 'VV',
 'postag[:2]': 'VV',
 'word.isdigit()': False,
 'word.istitle()': False,
 'word.isupper()': False,
 'word.lower()': '받',
 'word[-2:]': '받',
 'word[-3:]': '받'}


In [8]:
%%time
X_train = [sent2features(s) for s in train_sents]
y_train = [sent2labels(s) for s in train_sents]

X_test = [sent2features(s) for s in test_sents]
y_test = [sent2labels(s) for s in test_sents]

CPU times: user 636 ms, sys: 47.9 ms, total: 684 ms
Wall time: 1.42 s


In [16]:
X_test

[[{'bias': 1.0,
   'word.lower()': '그날',
   'word[-3:]': '그날',
   'word[-2:]': '그날',
   'word.isupper()': False,
   'word.istitle()': False,
   'word.isdigit()': False,
   'postag': 'NNG',
   'postag[:2]': 'NN',
   'BOS': True,
   '+1:word.lower()': '따라',
   '+1:word.istitle()': False,
   '+1:word.isupper()': False,
   '+1:postag': 'JX',
   '+1:postag[:2]': 'JX'},
  {'bias': 1.0,
   'word.lower()': '따라',
   'word[-3:]': '따라',
   'word[-2:]': '따라',
   'word.isupper()': False,
   'word.istitle()': False,
   'word.isdigit()': False,
   'postag': 'JX',
   'postag[:2]': 'JX',
   '-1:word.lower()': '그날',
   '-1:word.istitle()': False,
   '-1:word.isupper()': False,
   '-1:postag': 'NNG',
   '-1:postag[:2]': 'NN',
   '+1:word.lower()': '국장',
   '+1:word.istitle()': False,
   '+1:word.isupper()': False,
   '+1:postag': 'NNG',
   '+1:postag[:2]': 'NN'},
  {'bias': 1.0,
   'word.lower()': '국장',
   'word[-3:]': '국장',
   'word[-2:]': '국장',
   'word.isupper()': False,
   'word.istitle()': False,
  

In [9]:
%%time
crf = sklearn_crfsuite.CRF(
    algorithm='lbfgs', 
    c1=0.1, 
    c2=0.05, 
    max_iterations=100, 
    all_possible_transitions=True
)
crf.fit(X_train, y_train)

CPU times: user 33.6 s, sys: 45 ms, total: 33.7 s
Wall time: 1min 7s




CRF(algorithm='lbfgs', all_possible_transitions=True, c1=0.1, c2=0.05,
    keep_tempfiles=None, max_iterations=100)

In [10]:
labels = list(crf.classes_)
labels.remove('O')
labels

['B-DT',
 'I-DT',
 'B-OG',
 'I-OG',
 'B-PS',
 'I-PS',
 'B-LC',
 'I-LC',
 'B-TI',
 'I-TI']

In [11]:
y_pred = crf.predict(X_test)

f1_score = metrics.flat_f1_score(y_test, y_pred, average='weighted', labels=labels)
print ('F1 score: %0.3f'%f1_score)

F1 score: 0.839


In [12]:
# group B and I results
sorted_labels = sorted(
    labels, 
    key=lambda name: (name[1:], name[0])
)

print (metrics.flat_classification_report(
    y_test, 
    y_pred, 
    labels=sorted_labels, 
    digits=3
    ))

  _warn_prf(average, modifier, msg_start, len(result))


              precision    recall  f1-score   support

        B-DT      0.897     0.701     0.787        87
        I-DT      0.927     0.856     0.890       118
        B-LC      0.908     0.899     0.904        99
        I-LC      0.000     0.000     0.000         0
        B-OG      0.847     0.632     0.724       114
        I-OG      0.500     0.115     0.188        26
        B-PS      0.918     0.945     0.931       272
        I-PS      0.739     0.708     0.723        24
        B-TI      0.833     0.833     0.833        24
        I-TI      0.700     1.000     0.824         7

   micro avg      0.889     0.813     0.850       771
   macro avg      0.727     0.669     0.680       771
weighted avg      0.881     0.813     0.839       771



In [13]:
i = 1
for (tk, pos, real_bio), prd_bio in list(zip(test_sents[i], y_test[i])):
    if real_bio == 'O' and prd_bio == 'O':
        print ("Token: {} ".format(tk))
    else:
        print ("Token: %-7s ==> Real_BIO: %-5s vs. Prd_BIO: %-5s"%(tk, real_bio, prd_bio))


Token: 민이      ==> Real_BIO: B-PS  vs. Prd_BIO: B-PS 
Token: 할아버지 
Token: 박창덕     ==> Real_BIO: B-PS  vs. Prd_BIO: B-PS 
Token: 은 
Token: , 
Token: 그때 
Token: 신문 
Token: 기자 
Token: 로 
Token: 어느 
Token: 신문사 
Token: 에서 
Token: 일 
Token: 하 
Token: 고 
Token: 있 
Token: 었 
Token: 습니다 
Token: . 




# Appendix. Parameter Tuning
### RandomSearch

In [None]:
%%time
import scipy.stats

crf = sklearn_crfsuite.CRF(
    algorithm='lbfgs', 
    max_iterations=100, 
    all_possible_transitions=True
)

params_space = {
    'c1': scipy.stats.expon(scale=0.5),
    'c2': scipy.stats.expon(scale=0.05),
}

f1_scorer = make_scorer(metrics.flat_f1_score, average='weighted', labels=labels)

rs = RandomizedSearchCV(crf, params_space, 
                        cv=3, 
                        verbose=1, 
                        n_jobs=-1, 
                        n_iter=100, 
                        scoring=f1_scorer)
rs.fit(X_train, y_train)

In [None]:
print('best params:', rs.best_params_)
print('best CV score:', rs.best_score_)
print('model size: {:0.2f}M'.format(rs.best_estimator_.size_ / 1000000))
rs.best_estimator_

In [None]:
_x = [s.get('c1') for s in rs.cv_results_['params']]
_y = [s.get('c2') for s in rs.cv_results_['params']]
_c = list(rs.cv_results_['mean_test_score'])

"""
%matplotlib inline
import matplotlib.pyplot as plt
plt.style.use('ggplot')

fig = plt.figure()
fig.set_size_inches(12, 12)
ax = plt.gca()
ax.set_yscale('log')
ax.set_xscale('log')
ax.set_xlabel('C1')
ax.set_ylabel('C2')
ax.set_title("Randomized Hyperparameter Search CV Results (min={:0.3}, max={:0.3})".format(
    min(_c), max(_c)
))

ax.scatter(_x, _y, c=_c, s=60, alpha=0.9, edgecolors=[0,0,0])
print("Dark blue => {:0.4}, dark red => {:0.4}".format(min(_c), max(_c)))
"""

from pandas import DataFrame
Score = DataFrame(list(zip(_x, _y, _c)), columns = ['c1', 'c2', 'Score']).sort_values(by = 'Score', ascending=False)
Score.head()

In [None]:
crf = rs.best_estimator_

y_pred = crf.predict(X_test)
print (metrics.flat_classification_report(
    y_test, 
    y_pred, 
    labels=sorted_labels, 
    digits=3
    ))