In [1]:
'''reference:
https://sklearn-crfsuite.readthedocs.io/en/latest/tutorial.html
https://github.com/sea-boat/nlp_lab;
https://blog.csdn.net/wangyangzhizhou/article/details/79907174
'''


from itertools import chain
import nltk
from sklearn.metrics import classification_report
from sklearn.preprocessing import LabelBinarizer
import pycrfsuite


# 1，加载数据：
nltk.download('conll2002')  # 下载nltk提供的命名实体识别语料库
print(nltk.corpus.conll2002.fileids())

model_path = '../OutPut/crf.model'

train_sents = list(nltk.corpus.conll2002.iob_sents('esp.train'))
test_sents = list(nltk.corpus.conll2002.iob_sents('esp.testb'))

print(train_sents[0:3])
print(test_sents[0:3])

[nltk_data] Downloading package conll2002 to /home/jesse/nltk_data...
[nltk_data]   Package conll2002 is already up-to-date!


['esp.testa', 'esp.testb', 'esp.train', 'ned.testa', 'ned.testb', 'ned.train']
[[('Melbourne', 'NP', 'B-LOC'), ('(', 'Fpa', 'O'), ('Australia', 'NP', 'B-LOC'), (')', 'Fpt', 'O'), (',', 'Fc', 'O'), ('25', 'Z', 'O'), ('may', 'NC', 'O'), ('(', 'Fpa', 'O'), ('EFE', 'NC', 'B-ORG'), (')', 'Fpt', 'O'), ('.', 'Fp', 'O')], [('-', 'Fg', 'O')], [('El', 'DA', 'O'), ('Abogado', 'NC', 'B-PER'), ('General', 'AQ', 'I-PER'), ('del', 'SP', 'I-PER'), ('Estado', 'NC', 'I-PER'), (',', 'Fc', 'O'), ('Daryl', 'VMI', 'B-PER'), ('Williams', 'NC', 'I-PER'), (',', 'Fc', 'O'), ('subrayó', 'VMI', 'O'), ('hoy', 'RG', 'O'), ('la', 'DA', 'O'), ('necesidad', 'NC', 'O'), ('de', 'SP', 'O'), ('tomar', 'VMN', 'O'), ('medidas', 'NC', 'O'), ('para', 'SP', 'O'), ('proteger', 'VMN', 'O'), ('al', 'SP', 'O'), ('sistema', 'NC', 'O'), ('judicial', 'AQ', 'O'), ('australiano', 'AQ', 'O'), ('frente', 'RG', 'O'), ('a', 'SP', 'O'), ('una', 'DI', 'O'), ('página', 'NC', 'O'), ('de', 'SP', 'O'), ('internet', 'NC', 'O'), ('que', 'PR', 'O')

In [2]:
# 2，定义模型
def word2features(sent, i):
    word = sent[i][0]
    postag = sent[i][1]
    features = [
        'bias',
        'word.lower=' + word.lower(),
        'word[-3:]=' + word[-3:],
        'word[-2:]=' + word[-2:],
        'word.isupper=%s' % word.isupper(),
        'word.istitle=%s' % word.istitle(),
        'word.isdigit=%s' % word.isdigit(),
        'postag=' + postag,
        'postag[:2]=' + postag[:2],
    ]
    if i > 0:
        word1 = sent[i - 1][0]
        postag1 = sent[i - 1][1]
        features.extend([
            '-1:word.lower=' + word1.lower(),
            '-1:word.istitle=%s' % word1.istitle(),
            '-1:word.isupper=%s' % word1.isupper(),
            '-1:postag=' + postag1,
            '-1:postag[:2]=' + postag1[:2],
        ])
    else:
        features.append('BOS')
    if i < len(sent) - 1:
        word1 = sent[i + 1][0]
        postag1 = sent[i + 1][1]
        features.extend([
            '+1:word.lower=' + word1.lower(),
            '+1:word.istitle=%s' % word1.istitle(),
            '+1:word.isupper=%s' % word1.isupper(),
            '+1:postag=' + postag1,
            '+1:postag[:2]=' + postag1[:2],
        ])
    else:
        features.append('EOS')
    return features


def sent2features(sent):
    return [word2features(sent, i) for i in range(len(sent))]


def sent2labels(sent):
    return [label for token, postag, label in sent]


def sent2tokens(sent):
    return [token for token, postag, label in sent]


def train():
    print(sent2features(train_sents[0])[0])
    X_train = [sent2features(s) for s in train_sents]
    y_train = [sent2labels(s) for s in train_sents]

    trainer = pycrfsuite.Trainer(verbose=False)
    trainer.set_params({
        'c1': 1.0,  # coefficient for L1 penalty
        'c2': 1e-3,  # coefficient for L2 penalty
        'max_iterations': 50,  # stop earlier
        # include transitions that are possible, but not observed
        'feature.possible_transitions': True
    })

    print(trainer.params())

    for xseq, yseq in zip(X_train, y_train):
        trainer.append(xseq, yseq)

    trainer.train(model_path)
    print(trainer.logparser.last_iteration)


def predict():
    tagger = pycrfsuite.Tagger()
    tagger.open(model_path)
    example_sent = test_sents[3]
    print(' '.join(sent2tokens(example_sent)), end='\n\n')
    print("Predicted:", ' '.join(tagger.tag(sent2features(example_sent))))
    print("Correct:  ", ' '.join(sent2labels(example_sent)))


def bio_classification_report(y_true, y_pred):
    lb = LabelBinarizer()
    y_true_combined = lb.fit_transform(list(chain.from_iterable(y_true)))
    y_pred_combined = lb.transform(list(chain.from_iterable(y_pred)))

    tagset = set(lb.classes_) - {'O'}
    tagset = sorted(tagset, key=lambda tag: tag.split('-', 1)[::-1])
    class_indices = {cls: idx for idx, cls in enumerate(lb.classes_)}

    return classification_report(
        y_true_combined,
        y_pred_combined,
        labels=[class_indices[cls] for cls in tagset],
        target_names=tagset,
    )


def evaluate():
    tagger = pycrfsuite.Tagger()
    tagger.open(model_path)
    X_test = [sent2features(s) for s in test_sents]
    y_test = [sent2labels(s) for s in test_sents]
    y_pred = [tagger.tag(xseq) for xseq in X_test]
    print(bio_classification_report(y_test, y_pred))

In [3]:
# 3，模型训练：

train()

['bias', 'word.lower=melbourne', 'word[-3:]=rne', 'word[-2:]=ne', 'word.isupper=False', 'word.istitle=True', 'word.isdigit=False', 'postag=NP', 'postag[:2]=NP', 'BOS', '+1:word.lower=(', '+1:word.istitle=False', '+1:word.isupper=False', '+1:postag=Fpa', '+1:postag[:2]=Fp']
['feature.minfreq', 'feature.possible_states', 'feature.possible_transitions', 'c1', 'c2', 'max_iterations', 'num_memories', 'epsilon', 'period', 'delta', 'linesearch', 'max_linesearch']
{'num': 50, 'scores': {}, 'loss': 14807.577946, 'feature_norm': 79.110017, 'error_norm': 1262.912078, 'active_features': 11346, 'linesearch_trials': 1, 'linesearch_step': 1.0, 'time': 0.298}


In [4]:
predict()  # 预测

García Aranda presentó a la prensa el sistema Amadeus , que utilizan la mayor parte de las agencias de viajes españolas para reservar billetes de avión o tren , así como plazas de hotel , y que ahora pueden utilizar también los usuarios finales a través de Internet .

Predicted: B-PER I-PER O O O O O O B-MISC O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O B-MISC O
Correct:   B-PER I-PER O O O O O O B-MISC O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O B-MISC O


In [5]:
evaluate()  # 评估模型

              precision    recall  f1-score   support

       B-LOC       0.78      0.75      0.76      1084
       I-LOC       0.66      0.60      0.63       325
      B-MISC       0.69      0.47      0.56       339
      I-MISC       0.61      0.49      0.54       557
       B-ORG       0.79      0.81      0.80      1400
       I-ORG       0.80      0.79      0.80      1104
       B-PER       0.82      0.87      0.84       735
       I-PER       0.87      0.93      0.90       634

   micro avg       0.78      0.76      0.77      6178
   macro avg       0.75      0.71      0.73      6178
weighted avg       0.77      0.76      0.76      6178
 samples avg       0.09      0.09      0.09      6178



  'precision', 'predicted', average, warn_for)
  'recall', 'true', average, warn_for)
