In [None]:
!wget https://raw.githubusercontent.com/Rexhaif/ner-dialogues-hackathon/master/data/train.conll
!wget https://raw.githubusercontent.com/Rexhaif/ner-dialogues-hackathon/master/data/test.conll

In [None]:
! wget https://storage.yandexcloud.net/natasha-slovnet/packs/slovnet_syntax_news_v1.tar
! wget https://storage.yandexcloud.net/natasha-navec/packs/navec_news_v1_1B_250K_300d_100q.tar

In [2]:
import sklearn_crfsuite
from sklearn_crfsuite import scorers
from sklearn_crfsuite import metrics
import scipy
import eli5
import pymorphy2
import spacy
import sklearn
import scipy.stats
from sklearn.metrics import make_scorer
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import RandomizedSearchCV
from navec import Navec
from slovnet import Syntax
from razdel import tokenize

In [None]:
navec = Navec.load('navec_news_v1_1B_250K_300d_100q.tar')
syntax = Syntax.load('slovnet_syntax_news_v1.tar')
syntax.navec(navec)

In [4]:
en_nlp = spacy.load("en_core_web_sm")

morph = pymorphy2.MorphAnalyzer()

In [5]:
with open('train.conll') as train_f, open('test.conll') as test_f:
    train_str = train_f.read()
    test_str = test_f.read()

In [6]:
def get_word_pos(word, sent, str_sent):
    pos = morph.parse(word[0])[0].tag.POS
    if not pos:
        i = sent.index(word)
        doc = en_nlp(str_sent)
        pos = doc[i].pos_
    return pos

In [7]:
def get_sent_syntax(sent):
    tokens = [_.text for _ in tokenize(sent)]
    markup = syntax(tokens)
    return [token.rel for token in markup.tokens]

In [8]:
def parse_conll(conll_str):
    sents = [tuple(sent.split()) for sent in conll_str.split('\n\n')]
    splitted_sents = []
    for i, sent in enumerate(sents):
        it = iter(sent)
        splitted_sents.append(list(zip(it, it)))

    parsed_sents = []
    for sent in splitted_sents:
        str_sent = ' '.join([tok[0] for tok in sent]).capitalize()
        if str_sent:
            parsed_sents.append([(word[0], get_word_pos(word, sent, str_sent), rel, word[1])
                                 for word, rel in zip(sent, get_sent_syntax(str_sent))])

    return parsed_sents

In [9]:
train_sents = parse_conll(train_str)
test_sents = parse_conll(test_str)

In [10]:
train_sents

[[('спой', 'VERB', 'acl', 'O'),
  ('из', 'PREP', 'case', 'O'),
  ('шоколад', 'NOUN', 'obl', 'B-FILM'),
  ('песню', 'NOUN', 'appos', 'O'),
  ('LItaliano', 'NOUN', 'appos', 'B-SONG')],
 [('давай', 'VERB', 'advmod', 'O'),
  ('послушаем', 'VERB', 'root', 'O'),
  ('дэнан', 'NOUN', 'obj', 'B-SINGER')],
 [('вруби', 'VERB', 'root', 'O'),
  ('The', 'DET', 'appos', 'B-SONG'),
  ('House', 'NOUN', 'flat:foreign', 'I-SONG'),
  ('Of', 'ADP', 'flat:foreign', 'I-SONG'),
  ('The', 'DET', 'flat:foreign', 'I-SONG'),
  ('Rising', 'VERB', 'flat:foreign', 'I-SONG'),
  ('Sun', 'NOUN', 'flat:foreign', 'I-SONG')],
 [('играй', 'VERB', 'root', 'O'),
  ('мэззи', 'NOUN', 'obj', 'B-SINGER'),
  ('ста', 'NUMR', 'nummod', 'I-SINGER')],
 [('воспроизведи', 'VERB', 'amod', 'O'),
  ('бородин', 'NOUN', 'root', 'B-COMPOSER')],
 [('включи', 'VERB', 'root', 'O')],
 [('спой', 'VERB', 'amod', 'O'),
  ('джордж', 'NOUN', 'amod', 'B-COMPOSER'),
  ('гершвин', 'NOUN', 'appos', 'I-COMPOSER')],
 [('расскажи', 'VERB', 'root', 'O'), ('м

In [11]:
def word2features(sent, i):
    word = sent[i][0]
    postag = sent[i][1]
    relteg = sent[i][2]

    features = {
        'bias': 1.0,
        'word.lower()': word.lower(),
        'word[-3:]': word[-3:],
        'word.isupper()': word.isupper(),
        'word.istitle()': word.istitle(),
        'word.isdigit()': word.isdigit(),
        'postag': postag,
        'postag[:2]': postag[:2],
        'reltag': relteg,
    }
    if i > 0:
        word1 = sent[i-1][0]
        postag1 = sent[i-1][1]
        relteg1 = sent[i-1][2]
        features.update({
            '-1:word.lower()': word1.lower(),
            '-1:word.istitle()': word1.istitle(),
            '-1:word.isupper()': word1.isupper(),
            '-1:postag': postag1,
            '-1:postag[:2]': postag1[:2],
            '-1:reltag': relteg1,
        })
    else:
        features['BOS'] = True

    if i < len(sent)-1:
        word1 = sent[i+1][0]
        postag1 = sent[i+1][1]
        relteg1 = sent[i+1][2]
        features.update({
            '+1:word.lower()': word1.lower(),
            '+1:word.istitle()': word1.istitle(),
            '+1:word.isupper()': word1.isupper(),
            '+1:postag': postag1,
            '+1:postag[:2]': postag1[:2],
            '+1:reltag': relteg1,
        })
    else:
        features['EOS'] = True

    return features


def sent2features(sent):
    return [word2features(sent, i) for i in range(len(sent))]

def sent2labels(sent):
    return [label for token, postag, reltag, label in sent]

def sent2tokens(sent):
    return [token for token, postag, reltag, label in sent]

In [12]:
X_train = [sent2features(s) for s in train_sents]
y_train = [sent2labels(s) for s in train_sents]

In [13]:
X_test = [sent2features(s) for s in test_sents]
y_test = [sent2labels(s) for s in test_sents]

In [14]:
X_train[0][1]

{'bias': 1.0,
 'word.lower()': 'из',
 'word[-3:]': 'из',
 'word.isupper()': False,
 'word.istitle()': False,
 'word.isdigit()': False,
 'postag': 'PREP',
 'postag[:2]': 'PR',
 'reltag': 'case',
 '-1:word.lower()': 'спой',
 '-1:word.istitle()': False,
 '-1:word.isupper()': False,
 '-1:postag': 'VERB',
 '-1:postag[:2]': 'VE',
 '-1:reltag': 'acl',
 '+1:word.lower()': 'шоколад',
 '+1:word.istitle()': False,
 '+1:word.isupper()': False,
 '+1:postag': 'NOUN',
 '+1:postag[:2]': 'NO',
 '+1:reltag': 'obl'}

In [15]:
# ! pip install -U 'scikit-learn<0.24'

In [16]:
crf = sklearn_crfsuite.CRF(
    algorithm='lbfgs',
    c1=0.1,
    c2=0.1,
    max_iterations=20,
    all_possible_transitions=False,
)
crf.fit(X_train, y_train)



CRF(algorithm='lbfgs', all_possible_transitions=False, c1=0.1, c2=0.1,
    keep_tempfiles=None, max_iterations=20)

In [17]:
eli5.show_weights(crf, top=30)

From \ To,O,B-BOOK,I-BOOK,B-COMPOSER,I-COMPOSER,B-FILM,I-FILM,B-SINGER,I-SINGER,B-SONG,I-SONG
O,0.476,2.173,0.0,2.052,0.0,3.787,0.0,2.946,0.0,3.077,0.0
B-BOOK,0.0,0.0,6.031,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
I-BOOK,0.0,0.0,5.598,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
B-COMPOSER,-0.118,0.0,0.0,0.0,4.805,0.0,0.0,0.0,0.0,0.0,0.0
I-COMPOSER,0.01,0.0,0.0,0.0,2.761,0.0,0.0,0.0,0.0,0.0,0.0
B-FILM,-0.225,0.0,0.0,0.0,0.0,0.0,6.22,0.0,0.0,0.0,0.0
I-FILM,-0.303,0.0,0.0,0.0,0.0,0.0,5.176,0.0,0.0,0.0,0.0
B-SINGER,-0.349,0.0,0.0,0.0,0.0,0.0,0.0,0.0,7.036,0.0,0.0
I-SINGER,0.34,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3.779,0.0,0.0
B-SONG,-0.518,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,6.518

Weight?,Feature,Unnamed: 2_level_0,Unnamed: 3_level_0,Unnamed: 4_level_0,Unnamed: 5_level_0,Unnamed: 6_level_0,Unnamed: 7_level_0,Unnamed: 8_level_0,Unnamed: 9_level_0,Unnamed: 10_level_0
Weight?,Feature,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
Weight?,Feature,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2
Weight?,Feature,Unnamed: 2_level_3,Unnamed: 3_level_3,Unnamed: 4_level_3,Unnamed: 5_level_3,Unnamed: 6_level_3,Unnamed: 7_level_3,Unnamed: 8_level_3,Unnamed: 9_level_3,Unnamed: 10_level_3
Weight?,Feature,Unnamed: 2_level_4,Unnamed: 3_level_4,Unnamed: 4_level_4,Unnamed: 5_level_4,Unnamed: 6_level_4,Unnamed: 7_level_4,Unnamed: 8_level_4,Unnamed: 9_level_4,Unnamed: 10_level_4
Weight?,Feature,Unnamed: 2_level_5,Unnamed: 3_level_5,Unnamed: 4_level_5,Unnamed: 5_level_5,Unnamed: 6_level_5,Unnamed: 7_level_5,Unnamed: 8_level_5,Unnamed: 9_level_5,Unnamed: 10_level_5
Weight?,Feature,Unnamed: 2_level_6,Unnamed: 3_level_6,Unnamed: 4_level_6,Unnamed: 5_level_6,Unnamed: 6_level_6,Unnamed: 7_level_6,Unnamed: 8_level_6,Unnamed: 9_level_6,Unnamed: 10_level_6
Weight?,Feature,Unnamed: 2_level_7,Unnamed: 3_level_7,Unnamed: 4_level_7,Unnamed: 5_level_7,Unnamed: 6_level_7,Unnamed: 7_level_7,Unnamed: 8_level_7,Unnamed: 9_level_7,Unnamed: 10_level_7
Weight?,Feature,Unnamed: 2_level_8,Unnamed: 3_level_8,Unnamed: 4_level_8,Unnamed: 5_level_8,Unnamed: 6_level_8,Unnamed: 7_level_8,Unnamed: 8_level_8,Unnamed: 9_level_8,Unnamed: 10_level_8
Weight?,Feature,Unnamed: 2_level_9,Unnamed: 3_level_9,Unnamed: 4_level_9,Unnamed: 5_level_9,Unnamed: 6_level_9,Unnamed: 7_level_9,Unnamed: 8_level_9,Unnamed: 9_level_9,Unnamed: 10_level_9
Weight?,Feature,Unnamed: 2_level_10,Unnamed: 3_level_10,Unnamed: 4_level_10,Unnamed: 5_level_10,Unnamed: 6_level_10,Unnamed: 7_level_10,Unnamed: 8_level_10,Unnamed: 9_level_10,Unnamed: 10_level_10
+5.201,BOS,,,,,,,,,
+4.635,word[-3:]:ора,,,,,,,,,
+4.549,word[-3:]:сню,,,,,,,,,
+4.549,word.lower():песню,,,,,,,,,
+4.194,word.lower():композитора,,,,,,,,,
+3.072,word.lower():включи,,,,,,,,,
+3.072,word[-3:]:ючи,,,,,,,,,
+2.937,word.lower():автора,,,,,,,,,
+2.455,reltag:nmod,,,,,,,,,
+2.418,word[-3:]:вца,,,,,,,,,

Weight?,Feature
+5.201,BOS
+4.635,word[-3:]:ора
+4.549,word[-3:]:сню
+4.549,word.lower():песню
+4.194,word.lower():композитора
+3.072,word.lower():включи
+3.072,word[-3:]:ючи
+2.937,word.lower():автора
+2.455,reltag:nmod
+2.418,word[-3:]:вца

Weight?,Feature
+1.231,-1:word.lower():книгу
+0.990,-1:reltag:obj
+0.954,-1:word.lower():аудиокнигу
+0.927,word[-3:]:ени
+0.718,-1:postag:VERB
+0.718,-1:postag[:2]:VE
+0.526,+1:postag:ADJS
+0.517,+1:postag:CONJ
+0.512,+1:postag[:2]:CO
+0.502,word[-3:]:лик

Weight?,Feature
+0.912,reltag:obl
+0.758,-1:postag:ADJS
+0.751,-1:postag:CONJ
+0.749,-1:postag[:2]:CO
+0.733,-1:postag[:2]:AD
+0.588,bias
+0.517,-1:word.lower():в
+0.514,EOS
+0.511,+1:reltag:obl
+0.509,postag:CONJ

Weight?,Feature
+5.257,-1:word.lower():композитора
+0.902,-1:reltag:nmod
+0.891,word[-3:]:рге
+0.891,word.lower():серге
+0.775,word.lower():никола
+0.775,word[-3:]:ола
+0.761,+1:reltag:appos
+0.681,word.lower():антон
+0.646,-1:word.lower():включи
+0.639,word[-3:]:ком

Weight?,Feature
+1.588,-1:postag:NOUN
+1.588,-1:postag[:2]:NO
+1.083,-1:reltag:appos
+1.067,word[-3:]:вич
+1.003,reltag:appos
+0.894,-1:word.lower():серге
+0.865,-1:reltag:nsubj
+0.775,-1:word.lower():никола
+0.729,+1:reltag:obj
+0.674,-1:word.lower():антон

Weight?,Feature
+1.630,-1:word.lower():фильм
+1.374,-1:word.lower():фильма
+1.329,-1:word.lower():из
+0.894,word[-3:]:тор
+0.881,postag[:2]:AD
+0.852,word.lower():втор
+0.808,-1:postag:PREP
+0.780,-1:reltag:case
+0.710,-1:postag[:2]:PR
+0.593,+1:reltag:obl

Weight?,Feature
+0.831,-1:postag:ADVB
+0.758,-1:postag:CONJ
+0.757,reltag:conj
+0.754,-1:postag[:2]:CO
+0.700,word[-3:]:два
+0.700,word.lower():два
+0.696,bias
+0.692,reltag:cc
+0.658,-1:postag[:2]:NO
+0.658,-1:postag:NOUN

Weight?,Feature
+7.198,-1:word.lower():автора
+3.203,-1:word.lower():певца
+1.285,word.istitle()
+1.067,-1:reltag:nmod
+0.879,-1:postag[:2]:VE
+0.879,-1:postag:VERB
+0.597,reltag:appos
+0.482,-1:reltag:root
+0.478,word[-3:]:мин
+0.467,reltag:advmod

Weight?,Feature
+1.646,reltag:flat:foreign
+1.607,+1:word.lower():песню
+1.576,-1:word.istitle()
+1.253,word.istitle()
+1.046,-1:reltag:nsubj
+0.981,-1:postag[:2]:AD
+0.862,EOS
+0.671,-1:postag:NOUN
+0.671,-1:postag[:2]:NO
+0.601,-1:postag:ADJ

Weight?,Feature
+7.295,-1:word.lower():песню
+1.552,-1:word.lower():дорожку
+1.440,-1:word.lower():произведение
+1.078,-1:reltag:obj
+1.021,word.istitle()
+0.951,-1:postag[:2]:VE
+0.951,-1:postag:VERB
+0.846,word[-3:]:Les
+0.846,word.lower():les
+0.801,word[-3:]:сан

Weight?,Feature
+2.833,+1:word.lower():автора
+1.355,reltag:flat:foreign
+1.283,-1:word.istitle()
+1.207,-1:postag:INTJ
+1.164,word.istitle()
+1.102,-1:postag[:2]:IN
+1.097,EOS
+0.823,bias
+0.794,-1:word.lower():je
+0.774,-1:word.lower():les


In [18]:
labels = list(crf.classes_)
labels.remove('O')
labels

['B-FILM',
 'B-SONG',
 'B-SINGER',
 'I-SONG',
 'I-SINGER',
 'B-COMPOSER',
 'I-COMPOSER',
 'I-FILM',
 'B-BOOK',
 'I-BOOK']

In [19]:
y_pred = crf.predict(X_test)
metrics.flat_f1_score(y_test, y_pred,
                      average='weighted', labels=labels)

0.5943094723945032

In [20]:
y_test

[['O', 'B-SINGER', 'I-SINGER'],
 ['O', 'B-SINGER'],
 ['O', 'B-SONG'],
 ['O', 'B-FILM', 'I-FILM', 'I-FILM'],
 ['O', 'B-BOOK', 'I-BOOK'],
 ['O', 'O', 'B-SINGER', 'I-SINGER', 'I-SINGER', 'O', 'B-SONG'],
 ['O', 'B-COMPOSER'],
 ['O', 'B-SINGER', 'I-SINGER'],
 ['O', 'B-SINGER', 'I-SINGER', 'I-SINGER'],
 ['O', 'B-SINGER'],
 ['O', 'B-BOOK', 'I-BOOK'],
 ['O', 'B-SINGER'],
 ['O', 'B-SINGER'],
 ['O', 'O', 'B-SINGER', 'I-SINGER', 'O', 'B-SONG'],
 ['O', 'B-SINGER'],
 ['O', 'O', 'O', 'B-SINGER', 'I-SINGER', 'I-SINGER'],
 ['O', 'B-SINGER', 'I-SINGER'],
 ['O', 'O', 'B-SONG', 'I-SONG', 'I-SONG', 'I-SONG', 'I-SONG'],
 ['O', 'O', 'B-SINGER', 'O', 'B-SONG', 'I-SONG', 'I-SONG', 'I-SONG', 'I-SONG'],
 ['O', 'B-SONG', 'I-SONG', 'I-SONG'],
 ['O', 'B-SONG'],
 ['O', 'O', 'B-SINGER', 'I-SINGER', 'O', 'B-SONG', 'I-SONG'],
 ['O', 'B-FILM', 'I-FILM', 'I-FILM', 'I-FILM'],
 ['O', 'B-COMPOSER'],
 ['O', 'B-SINGER', 'I-SINGER'],
 ['O', 'B-SONG'],
 ['O', 'O', 'B-SONG'],
 ['O', 'O', 'B-SONG', 'I-SONG'],
 ['O', 'B-SINGER'],

In [21]:
sorted_labels = sorted(
    labels, 
    key=lambda name: (name[1:], name[0])
)
print(metrics.flat_classification_report(
    y_test, y_pred, labels=sorted_labels, digits=3
))

              precision    recall  f1-score   support

      B-BOOK      0.281     0.123     0.171        73
      I-BOOK      0.214     0.101     0.137        89
  B-COMPOSER      0.655     0.543     0.594        35
  I-COMPOSER      0.600     0.714     0.652        21
      B-FILM      0.463     0.362     0.407        69
      I-FILM      0.359     0.475     0.409        59
    B-SINGER      0.620     0.806     0.701       253
    I-SINGER      0.686     0.653     0.669       147
      B-SONG      0.715     0.655     0.684       165
      I-SONG      0.759     0.870     0.811       192

   micro avg      0.618     0.617     0.617      1103
   macro avg      0.535     0.530     0.523      1103
weighted avg      0.589     0.617     0.594      1103





In [22]:
# define fixed parameters and parameters to search
crf = sklearn_crfsuite.CRF(
    algorithm='lbfgs', 
    max_iterations=100, 
    all_possible_transitions=True
)
params_space = {
    'c1': scipy.stats.expon(scale=0.5),
    'c2': scipy.stats.expon(scale=0.05),
}

# use the same metric for evaluation
f1_scorer = make_scorer(metrics.flat_f1_score, 
                        average='weighted', labels=labels)

# search
rs = RandomizedSearchCV(crf, params_space, 
                        cv=10, 
                        verbose=1, 
                        n_jobs=-1, 
                        n_iter=50, 
                        scoring=f1_scorer)
rs.fit(X_train, y_train)

Fitting 10 folds for each of 50 candidates, totalling 500 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:    5.1s
[Parallel(n_jobs=-1)]: Done 184 tasks      | elapsed:   23.9s
[Parallel(n_jobs=-1)]: Done 434 tasks      | elapsed:   59.7s
[Parallel(n_jobs=-1)]: Done 500 out of 500 | elapsed:  1.2min finished


RandomizedSearchCV(cv=10,
                   estimator=CRF(algorithm='lbfgs',
                                 all_possible_transitions=True,
                                 keep_tempfiles=None, max_iterations=100),
                   n_iter=50, n_jobs=-1,
                   param_distributions={'c1': <scipy.stats._distn_infrastructure.rv_frozen object at 0x16854a4f0>,
                                        'c2': <scipy.stats._distn_infrastructure.rv_frozen object at 0x13f7e34f0>},
                   scoring=make_scorer(flat_f1_score, average=weighted, labels=['B-FILM', 'B-SONG', 'B-SINGER', 'I-SONG', 'I-SINGER', 'B-COMPOSER', 'I-COMPOSER', 'I-FILM', 'B-BOOK', 'I-BOOK']),
                   verbose=1)

In [25]:
# crf = rs.best_estimator_
print('best params:', rs.best_params_)
print('best CV score:', rs.best_score_)
print('model size: {:0.2f}M'.format(rs.best_estimator_.size_ / 1000000))

best params: {'c1': 0.002877533268880202, 'c2': 0.04559724830895078}
best CV score: 0.771982061689148
model size: 0.60M


(Лучший результат был примерно 0.776)

In [26]:
crf = rs.best_estimator_
y_pred = crf.predict(X_test)
print(metrics.flat_classification_report(
    y_test, y_pred, labels=sorted_labels, digits=3
))

              precision    recall  f1-score   support

      B-BOOK      0.462     0.247     0.321        73
      I-BOOK      0.528     0.315     0.394        89
  B-COMPOSER      0.750     0.514     0.610        35
  I-COMPOSER      0.733     0.524     0.611        21
      B-FILM      0.630     0.420     0.504        69
      I-FILM      0.526     0.508     0.517        59
    B-SINGER      0.626     0.842     0.718       253
    I-SINGER      0.628     0.782     0.697       147
      B-SONG      0.753     0.667     0.707       165
      I-SONG      0.821     0.833     0.827       192

   micro avg      0.667     0.664     0.665      1103
   macro avg      0.646     0.565     0.591      1103
weighted avg      0.662     0.664     0.651      1103



In [28]:
from collections import Counter

def print_transitions(trans_features):
    for (label_from, label_to), weight in trans_features:
        print("%-6s -> %-7s %0.6f" % (label_from, label_to, weight))

print("Top likely transitions:")
print_transitions(Counter(crf.transition_features_).most_common(20))

print("\nTop unlikely transitions:")
print_transitions(Counter(crf.transition_features_).most_common()[-20:])

Top likely transitions:
B-SINGER -> I-SINGER 6.852154
B-SONG -> I-SONG  6.251669
I-SONG -> I-SONG  6.057899
B-BOOK -> I-BOOK  5.511594
B-FILM -> I-FILM  5.388728
I-BOOK -> I-BOOK  5.308638
I-FILM -> I-FILM  5.214553
B-COMPOSER -> I-COMPOSER 4.242289
I-SINGER -> I-SINGER 4.193667
O      -> B-FILM  3.897535
I-COMPOSER -> I-COMPOSER 3.541044
O      -> B-SONG  3.162245
O      -> B-SINGER 3.042782
O      -> B-BOOK  1.940641
O      -> B-COMPOSER 1.682725
I-SONG -> O       0.780802
B-SINGER -> O       0.526489
O      -> O       0.315762
I-SINGER -> O       0.262729
B-COMPOSER -> O       0.011481

Top unlikely transitions:
I-SINGER -> I-FILM  -0.835057
B-SONG -> I-SINGER -0.836433
B-FILM -> I-SONG  -0.849482
B-COMPOSER -> I-BOOK  -0.854179
B-SINGER -> I-BOOK  -0.887077
B-SINGER -> I-SONG  -0.929446
I-SINGER -> I-COMPOSER -0.936750
I-BOOK -> O       -0.974832
B-COMPOSER -> B-COMPOSER -0.982678
B-FILM -> I-BOOK  -1.000578
B-SINGER -> I-COMPOSER -1.040957
B-COMPOSER -> I-SINGER -1.050220
B-BOOK -

In [29]:
def print_state_features(state_features):
    for (attr, label), weight in state_features:
        print("%0.6f %-8s %s" % (weight, label, attr))    

print("Top positive:")
print_state_features(Counter(crf.state_features_).most_common(30))

print("\nTop negative:")
print_state_features(Counter(crf.state_features_).most_common()[-30:])

Top positive:
6.916881 B-SINGER -1:word.lower():автора
6.405713 B-SONG   -1:word.lower():песню
5.227783 B-COMPOSER -1:word.lower():композитора
4.786432 B-SINGER -1:word.lower():певца
4.106429 O        word[-3:]:ора
4.095130 B-FILM   -1:word.lower():фильм
3.871496 O        BOS
3.647503 B-FILM   word.lower():втор
3.625112 O        word.lower():песню
3.625112 O        word[-3:]:сню
3.532770 B-FILM   -1:word.lower():фильма
3.232909 B-BOOK   -1:word.lower():книгу
3.201462 O        word.lower():композитора
3.133674 B-BOOK   word.lower():разгром
3.092082 B-COMPOSER word.lower():листом
3.078853 O        word.lower():включи
3.078853 O        word[-3:]:ючи
3.065089 O        word[-3:]:игу
2.816466 B-SINGER word.lower():маикалсан
2.765500 B-FILM   word.lower():тора
2.752869 B-FILM   -1:word.lower():из
2.736680 B-BOOK   word.lower():одиночеств
2.710827 B-FILM   word.lower():бессонниц
2.705285 B-FILM   word.lower():малена
2.696054 B-COMPOSER word.lower():равел
2.683580 B-BOOK   word[-3:]:ках
2.64466