<a href="https://colab.research.google.com/github/HaeSeon/nlp-ner/blob/main/%5Bcrf_ner%5Dsklearn_crfsuite_jp.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [26]:
!pip install sklearn_crfsuite
!pip install eli5
import nltk
import sklearn_crfsuite
import eli5
from sklearn import preprocessing
from itertools import chain
from sklearn.metrics import classification_report, confusion_matrix



**Data**

In [2]:
train_set = 'train_set.txt'
test_set = 'test_set.txt'

In [3]:
f = open(train_set)
f.readlines()[:10]

['1960\tB-DATE\n',
 '年代\tI-DATE\n',
 'と\tO\n',
 '1970\tB-DATE\n',
 '年代\tI-DATE\n',
 'の\tO\n',
 '間\tO\n',
 'に\tO\n',
 '、\tO\n',
 'ジョエル\tB-PERSON\n']

In [5]:
def process_data(file_name):
  sentence=[]
  sentences=[]
  with open (file_name,'r') as f:
    for line in f.readlines():
      if "。" in line and len(sentence)>0:
        sentences.append(sentence)
        sentence=[]
      else:
        if(len(line.split('\t'))>1):
          word=line.split('\t')[0]
          tag=line.split('\t')[1]
          sentence.append((word, tag))

  return sentences

In [6]:
train_sents=process_data(train_set)
test_sents=process_data(test_set)

In [7]:
train_sents[0]

[('1960', 'B-DATE\n'),
 ('年代', 'I-DATE\n'),
 ('と', 'O\n'),
 ('1970', 'B-DATE\n'),
 ('年代', 'I-DATE\n'),
 ('の', 'O\n'),
 ('間', 'O\n'),
 ('に', 'O\n'),
 ('、', 'O\n'),
 ('ジョエル', 'B-PERSON\n'),
 ('・', 'I-PERSON\n'),
 ('モーゼス', 'I-PERSON\n'),
 ('は', 'O\n'),
 (' ', 'O\n'),
 ('プログラム', 'O\n'),
 ('中', 'O\n'),
 ('で', 'O\n'),
 ('積分', 'O\n'),
 ('問題', 'O\n'),
 ('で', 'O\n'),
 ('の', 'O\n'),
 ('記号', 'O\n'),
 ('的', 'O\n'),
 ('推論', 'O\n'),
 ('の', 'O\n'),
 ('パワー', 'O\n'),
 ('を', 'O\n'),
 ('示し', 'O\n'),
 ('た', 'O\n')]

**Feature extraction**

In [27]:
def word2features(sent, i):
    word = sent[i][0]
    postag = sent[i][1]

    features = {
        'bias': 1.0,
        'word.lower()': word.lower(),
        'word[-3:]': word[-3:],
        'word.isupper()': word.isupper(),
        'word.istitle()': word.istitle(),
        'word.isdigit()': word.isdigit(),
    }
    if i > 0:
        word1 = sent[i-1][0]
        postag1 = sent[i-1][1]
        features.update({
            '-1:word.lower()': word1.lower(),
            '-1:word.istitle()': word1.istitle(),
            '-1:word.isupper()': word1.isupper(),
        })
    else:
        features['BOS'] = True

    if i < len(sent)-1:
        word1 = sent[i+1][0]
        postag1 = sent[i+1][1]
        features.update({
            '+1:word.lower()': word1.lower(),
            '+1:word.istitle()': word1.istitle(),
            '+1:word.isupper()': word1.isupper(),
        })
    else:
        features['EOS'] = True

    return features



In [28]:
def sent2features(sent):
    return [word2features(sent, i) for i in range(len(sent))]

def sent2labels(sent):
    return [label for token, label in sent]

def sent2tokens(sent):
    return [token for token,  label in sent]



In [29]:
X_train = [sent2features(s) for s in train_sents]
y_train = [sent2labels(s) for s in train_sents]

X_test = [sent2features(s) for s in test_sents]
y_test = [sent2labels(s) for s in test_sents]

In [30]:
X_train[0][1]


{'+1:word.istitle()': False,
 '+1:word.isupper()': False,
 '+1:word.lower()': 'と',
 '-1:word.istitle()': False,
 '-1:word.isupper()': False,
 '-1:word.lower()': '1960',
 'bias': 1.0,
 'word.isdigit()': False,
 'word.istitle()': False,
 'word.isupper()': False,
 'word.lower()': '年代',
 'word[-2:]': '年代',
 'word[-3:]': '年代'}

**train CRF model** 

In [33]:
crf = sklearn_crfsuite.CRF(
    algorithm='lbfgs',
    c1=0.1,
    c2=0.1,
    max_iterations=20,
    all_possible_transitions=False,
)
crf.fit(X_train, y_train);

In [32]:

eli5.show_weights(crf, top=10)

AttributeError: ignored

In [34]:
eli5.show_weights(crf, top=10, targets=['O', 'B-ORG', 'I-ORG'])

AttributeError: ignored

In [38]:
def bio_classification_report(y_true, y_pred):
    """
    Classification report for a list of BIO-encoded sequences.
    It computes token-level metrics and discards "O" labels.
    
    Note that it requires scikit-learn 0.15+ (or a version from github master)
    to calculate averages properly!
    """
    lb=preprocessing.LabelBinarizer()
    y_true_combined = lb.fit_transform(list(chain.from_iterable(y_true)))
    y_pred_combined = lb.transform(list(chain.from_iterable(y_pred)))
        
    tagset = set(lb.classes_) - {'O'}
    tagset = sorted(tagset, key=lambda tag: tag.split('-', 1)[::-1])
    class_indices = {cls: idx for idx, cls in enumerate(lb.classes_)}
    
    return classification_report(
        y_true_combined,
        y_pred_combined,
        labels = [class_indices[cls] for cls in tagset],
        target_names = tagset,
    )

In [36]:
y_true = y_test
y_pred = []
for sent in test_sents:
    y_pred.append(crf.predict_single(sent2features(sent)))

In [37]:
bio_classification_report(y_true, y_pred)

  _warn_prf(average, modifier, msg_start, len(result))


'                 precision    recall  f1-score   support\n\n    B-ARTIFACT\n       0.74      0.29      0.42       169\n    I-ARTIFACT\n       0.54      0.37      0.44       320\n        B-DATE\n       0.91      0.83      0.87       305\n        I-DATE\n       0.94      0.92      0.93       470\n       B-EVENT\n       0.85      0.28      0.42        40\n       I-EVENT\n       0.77      0.40      0.52        86\n    B-LOCATION\n       0.79      0.33      0.46       588\n    I-LOCATION\n       0.61      0.32      0.42       318\n       B-MONEY\n       0.00      0.00      0.00        22\n       I-MONEY\n       1.00      0.10      0.18        40\n      B-NUMBER\n       0.65      0.62      0.63       203\n      I-NUMBER\n       0.63      0.67      0.65       252\n             O\n       0.92      0.98      0.95     23268\nB-ORGANIZATION\n       0.61      0.26      0.37       341\nI-ORGANIZATION\n       0.44      0.41      0.43       389\n       B-OTHER\n       0.60      0.33      0.43       