<a href="https://colab.research.google.com/github/HyeonhoonLee/KoNLP/blob/master/word2feature%2C_NER(sklearn_crfsuite).ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import os

import sklearn
from sklearn.metrics import make_scorer
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import RandomizedSearchCV

import sklearn_crfsuite
from sklearn_crfsuite import scorers, metrics

def read_ner_data(data_dir: str, file_name: str):
    """Read a tab-separated NER file into a list of sentences.

    The file at ``data_dir/file_name`` is read as UTF-8 text.  Sentences are
    separated by a line consisting of ``=====``; each remaining line is one
    token whose fields are separated by tabs.

    Returns:
        A list of sentences, each a list of tuples holding the tab-separated
        fields of one line.  Chunks that are empty after stripping are skipped.
    """
    path = os.path.join(data_dir, file_name)
    with open(path, mode='r', encoding='utf-8') as handle:
        raw_text = handle.read()

    sentences = []
    for chunk in raw_text.split('\n=====\n'):
        chunk = chunk.strip()
        if not chunk:
            # Nothing but whitespace between two separators.
            continue
        sentences.append([tuple(row.split('\t')) for row in chunk.splitlines()])
    return sentences

# Directory the data files are read from: "<current working directory>/data".
data_dir = os.path.join(os.getcwd(), 'data')
print (data_dir)

In [None]:
# Load the training sentences from 'train.txt' under data_dir; the %time line
# magic reports how long the statement takes.
%time train_sents = list(read_ner_data(data_dir, 'train.txt'))
print ('n_Train Data set: %d\n'%len(train_sents))

# Load the test sentences from 'test.txt' the same way.
%time test_sents = list(read_ner_data(data_dir, 'test.txt'))
print ('n_Test Data set: %d\n'%len(test_sents))

The functions below turn each word into a feature dictionary for vectorization, using extra information such as its POS tag and its position in the sentence.

In [None]:
def word2features(sent, i):
    """Build the feature dict for the token at position ``i`` of ``sent``.

    Each element of ``sent`` is indexed as ``(word, postag, ...)``.  The
    result describes the token itself (lower-cased form, last three and last
    two characters, upper/title/digit flags, POS tag and its first two
    characters) plus a smaller set of the same features for the previous and
    the next token.  When there is no previous token ``'BOS'`` is set to
    ``True``; when there is no next token ``'EOS'`` is set to ``True``.

    Returns:
        dict mapping feature names to values.
    """
    current = sent[i]
    token = current[0]
    tag = current[1]

    # Features of the token itself.
    feats = {}
    feats['bias'] = 1.0
    feats['word.lower()'] = token.lower()
    feats['word[-3:]'] = token[-3:]
    feats['word[-2:]'] = token[-2:]
    feats['word.isupper()'] = token.isupper()
    feats['word.istitle()'] = token.istitle()
    feats['word.isdigit()'] = token.isdigit()
    feats['postag'] = tag
    feats['postag[:2]'] = tag[:2]

    # Left context, or the beginning-of-sentence marker.
    if i <= 0:
        feats['BOS'] = True
    else:
        before = sent[i - 1]
        before_token = before[0]
        before_tag = before[1]
        feats['-1:word.lower()'] = before_token.lower()
        feats['-1:word.istitle()'] = before_token.istitle()
        feats['-1:word.isupper()'] = before_token.isupper()
        feats['-1:postag'] = before_tag
        feats['-1:postag[:2]'] = before_tag[:2]

    # Right context, or the end-of-sentence marker.
    if i >= len(sent) - 1:
        feats['EOS'] = True
    else:
        after = sent[i + 1]
        after_token = after[0]
        after_tag = after[1]
        feats['+1:word.lower()'] = after_token.lower()
        feats['+1:word.istitle()'] = after_token.istitle()
        feats['+1:word.isupper()'] = after_token.isupper()
        feats['+1:postag'] = after_tag
        feats['+1:postag[:2]'] = after_tag[:2]

    return feats


def sent2features(sent):
    """Return one feature dict per token of ``sent``, in sentence order."""
    features = []
    for position in range(len(sent)):
        features.append(word2features(sent, position))
    return features

def sent2labels(sent):
    """Return the label (third field) of every ``(token, postag, label)`` entry."""
    labels_in_order = []
    for _word, _pos, tag in sent:
        labels_in_order.append(tag)
    return labels_in_order

def sent2tokens(sent):
    """Return the token (first field) of every ``(token, postag, label)`` entry."""
    words = []
    for word, _pos, _tag in sent:
        words.append(word)
    return words

In [None]:
from pprint import pprint
# Position of the token to inspect within the first training sentence.
i = 5
# The raw token tuple as loaded from the data file.
pprint (train_sents[0][i])

print ('\n========================\n')

# The feature dict generated for that same token.
pprint (sent2features(train_sents[0])[i])

In [None]:
%%time
# For each split, build X as a list (one entry per sentence) of per-token
# feature dicts and y as the matching list of per-token labels.
X_train = [sent2features(s) for s in train_sents]
y_train = [sent2labels(s) for s in train_sents]

X_test = [sent2features(s) for s in test_sents]
y_test = [sent2labels(s) for s in test_sents]

In [None]:
%%time
# Baseline CRF with fixed hyperparameters. Per the sklearn-crfsuite docs,
# c1 and c2 are the L1 and L2 regularization coefficients.
crf = sklearn_crfsuite.CRF(
    algorithm='lbfgs', 
    c1=0.1, 
    c2=0.05, 
    max_iterations=100, 
    all_possible_transitions=True
)
# Fit on the training features/labels built in the previous cell.
crf.fit(X_train, y_train)

In [None]:
# Labels known to the fitted model, with 'O' removed so that the scores
# computed from `labels` below cover only the remaining tags.
labels = list(crf.classes_)
labels.remove('O')
labels

In [None]:
# Predict a label sequence for every test sentence.
y_pred = crf.predict(X_test)

# Weighted F1 restricted to `labels` (which excludes 'O').
f1_score = metrics.flat_f1_score(y_test, y_pred, average='weighted', labels=labels)
print ('F1 score: %0.3f'%f1_score)

In [None]:
# Group B and I results: sort by the label text after its first character,
# then by the first character, so labels that differ only in their first
# character (e.g. a B-/I- prefix) end up next to each other in the report.
sorted_labels = sorted(
    labels, 
    key=lambda name: (name[1:], name[0])
)

# Per-label report on the test set, in the grouped order.
print (metrics.flat_classification_report(
    y_test, 
    y_pred, 
    labels=sorted_labels, 
    digits=3
    ))

In [None]:
# Show, token by token, how the prediction for test sentence `i` compares with
# its gold annotation.
i = 1
# Pair each (token, POS tag, gold label) tuple with the label the model
# predicted for it.  The predictions live in y_pred; pairing with y_test would
# compare the gold labels with themselves and hide every error.
for (tk, pos, real_bio), prd_bio in zip(test_sents[i], y_pred[i]):
    if real_bio == 'O' and prd_bio == 'O':
        # Neither gold nor prediction marks an entity: print the token only.
        print ("Token: {} ".format(tk))
    else:
        print ("Token: %-7s ==> Real_BIO: %-5s vs. Prd_BIO: %-5s"%(tk, real_bio, prd_bio))


Randomized hyperparameter search (RandomizedSearchCV)

In [None]:
%%time
import scipy.stats

# Fresh estimator without c1/c2: the search below supplies them.
# Note that this rebinds the notebook-level name `crf`.
crf = sklearn_crfsuite.CRF(
    algorithm='lbfgs', 
    max_iterations=100, 
    all_possible_transitions=True
)

# Both regularization coefficients are sampled from exponential distributions
# (scale 0.5 for c1, 0.05 for c2).
params_space = {
    'c1': scipy.stats.expon(scale=0.5),
    'c2': scipy.stats.expon(scale=0.05),
}

# Same metric as the earlier evaluation: weighted F1 over `labels`.
f1_scorer = make_scorer(metrics.flat_f1_score, average='weighted', labels=labels)

# 100 sampled parameter settings, each scored with 3-fold cross-validation.
rs = RandomizedSearchCV(crf, params_space, 
                        cv=3, 
                        verbose=1, 
                        n_jobs=-1, 
                        n_iter=100, 
                        scoring=f1_scorer)
rs.fit(X_train, y_train)

In [None]:
# Summarize the search: the best sampled parameters, their cross-validation
# score, and the best estimator's `size_` divided by 1,000,000.
print('best params:', rs.best_params_)
print('best CV score:', rs.best_score_)
print('model size: {:0.2f}M'.format(rs.best_estimator_.size_ / 1000000))
rs.best_estimator_

In [None]:
# One entry per sampled candidate: its c1, its c2, and its mean test score.
_x = [s.get('c1') for s in rs.cv_results_['params']]
_y = [s.get('c2') for s in rs.cv_results_['params']]
_c = list(rs.cv_results_['mean_test_score'])

# The triple-quoted string below holds plotting code that is disabled: as a
# bare string expression it is evaluated but nothing inside it is executed.
"""
%matplotlib inline
import matplotlib.pyplot as plt
plt.style.use('ggplot')

fig = plt.figure()
fig.set_size_inches(12, 12)
ax = plt.gca()
ax.set_yscale('log')
ax.set_xscale('log')
ax.set_xlabel('C1')
ax.set_ylabel('C2')
ax.set_title("Randomized Hyperparameter Search CV Results (min={:0.3}, max={:0.3})".format(
    min(_c), max(_c)
))

ax.scatter(_x, _y, c=_c, s=60, alpha=0.9, edgecolors=[0,0,0])
print("Dark blue => {:0.4}, dark red => {:0.4}".format(min(_c), max(_c)))
"""

# Tabulate the candidates, best score first, and show the top rows.
from pandas import DataFrame
Score = DataFrame(list(zip(_x, _y, _c)), columns = ['c1', 'c2', 'Score']).sort_values(by = 'Score', ascending=False)
Score.head()

In [None]:
# Switch `crf` to the best estimator found by the randomized search and
# re-evaluate it on the test set; `y_pred` is overwritten with its predictions.
crf = rs.best_estimator_

y_pred = crf.predict(X_test)
print (metrics.flat_classification_report(
    y_test, 
    y_pred, 
    labels=sorted_labels, 
    digits=3
    ))