In [None]:
!wget https://raw.githubusercontent.com/Rexhaif/ner-dialogues-hackathon/master/data/train.conll
!wget https://raw.githubusercontent.com/Rexhaif/ner-dialogues-hackathon/master/data/dev.conll
!wget https://raw.githubusercontent.com/Rexhaif/ner-dialogues-hackathon/master/data/test.conll

In [1]:
import sklearn_crfsuite
from sklearn_crfsuite import scorers
from sklearn_crfsuite import metrics
import scipy
import eli5
import pymorphy2
import spacy
import sklearn
import scipy.stats
from sklearn.metrics import make_scorer
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import RandomizedSearchCV

In [2]:
nlp = spacy.load("en_core_web_sm")

In [3]:
morph = pymorphy2.MorphAnalyzer()

In [4]:
with open('train.conll') as train_f, open('test.conll') as test_f:
    train_str = train_f.read()
    test_str = test_f.read()

In [5]:
def get_word_pos(word, sent):
    pos = morph.parse(word[0])[0].tag.POS
    if not pos:
        i = sent.index(word)
        sent = ' '.join([tok[0] for tok in sent]).capitalize()
        doc = nlp(sent)
        pos = doc[i].pos_
    return pos

In [6]:
def parse_conll(conll_str):
    sents = [tuple(sent.split()) for sent in conll_str.split('\n\n')]
    splitted_sents = []
    for sent in sents:
        i = iter(sent)
        splitted_sents.append(list(zip(i, i)))
    parsed_sents = [[(word[0], get_word_pos(word, sent), word[1])
                     for word in sent] for sent in splitted_sents]
    return parsed_sents

In [7]:
train_sents = parse_conll(train_str)
test_sents = parse_conll(test_str)

In [31]:
train_sents

[[('спой', 'VERB', 'O'),
  ('из', 'PREP', 'O'),
  ('шоколад', 'NOUN', 'B-FILM'),
  ('песню', 'NOUN', 'O'),
  ('LItaliano', 'NOUN', 'B-SONG')],
 [('давай', 'VERB', 'O'),
  ('послушаем', 'VERB', 'O'),
  ('дэнан', 'NOUN', 'B-SINGER')],
 [('вруби', 'VERB', 'O'),
  ('The', 'DET', 'B-SONG'),
  ('House', 'NOUN', 'I-SONG'),
  ('Of', 'ADP', 'I-SONG'),
  ('The', 'DET', 'I-SONG'),
  ('Rising', 'VERB', 'I-SONG'),
  ('Sun', 'NOUN', 'I-SONG')],
 [('играй', 'VERB', 'O'),
  ('мэззи', 'NOUN', 'B-SINGER'),
  ('ста', 'NUMR', 'I-SINGER')],
 [('воспроизведи', 'VERB', 'O'), ('бородин', 'NOUN', 'B-COMPOSER')],
 [('включи', 'VERB', 'O')],
 [('спой', 'VERB', 'O'),
  ('джордж', 'NOUN', 'B-COMPOSER'),
  ('гершвин', 'NOUN', 'I-COMPOSER')],
 [('расскажи', 'VERB', 'O'), ('мва', 'NOUN', 'B-SONG')],
 [('давай', 'VERB', 'O'), ('Total', 'ADV', 'B-SINGER')],
 [('проиграй', 'VERB', 'O'), ('уэмом', 'NOUN', 'B-SINGER')],
 [('включи', 'VERB', 'O'),
  ('в', 'PREP', 'B-SONG'),
  ('последнюю', 'ADJF', 'I-SONG'),
  ('осень', 'N

In [32]:
def word2features(sent, i):
    word = sent[i][0]
    postag = sent[i][1]

    features = {
        'bias': 1.0,
        'word.lower()': word.lower(),
        'word[-3:]': word[-3:],
        'word.isupper()': word.isupper(),
        'word.istitle()': word.istitle(),
        'word.isdigit()': word.isdigit(),
        'postag': postag,
        'postag[:2]': postag[:2],
    }
    if i > 0:
        word1 = sent[i-1][0]
        postag1 = sent[i-1][1]
        features.update({
            '-1:word.lower()': word1.lower(),
            '-1:word.istitle()': word1.istitle(),
            '-1:word.isupper()': word1.isupper(),
            '-1:postag': postag1,
            '-1:postag[:2]': postag1[:2],
        })
    else:
        features['BOS'] = True

    if i < len(sent)-1:
        word1 = sent[i+1][0]
        postag1 = sent[i+1][1]
        features.update({
            '+1:word.lower()': word1.lower(),
            '+1:word.istitle()': word1.istitle(),
            '+1:word.isupper()': word1.isupper(),
            '+1:postag': postag1,
            '+1:postag[:2]': postag1[:2],
        })
    else:
        features['EOS'] = True

    return features


def sent2features(sent):
    return [word2features(sent, i) for i in range(len(sent))]

def sent2labels(sent):
    return [label for token, postag, label in sent]

def sent2tokens(sent):
    return [token for token, postag, label in sent]

In [33]:
X_train = [sent2features(s) for s in train_sents]
y_train = [sent2labels(s) for s in train_sents]

In [34]:
X_test = [sent2features(s) for s in test_sents]
y_test = [sent2labels(s) for s in test_sents]

In [35]:
X_train[0][1]

{'bias': 1.0,
 'word.lower()': 'из',
 'word[-3:]': 'из',
 'word.isupper()': False,
 'word.istitle()': False,
 'word.isdigit()': False,
 'postag': 'PREP',
 'postag[:2]': 'PR',
 '-1:word.lower()': 'спой',
 '-1:word.istitle()': False,
 '-1:word.isupper()': False,
 '-1:postag': 'VERB',
 '-1:postag[:2]': 'VE',
 '+1:word.lower()': 'шоколад',
 '+1:word.istitle()': False,
 '+1:word.isupper()': False,
 '+1:postag': 'NOUN',
 '+1:postag[:2]': 'NO'}

In [36]:
crf = sklearn_crfsuite.CRF(
    algorithm='lbfgs',
    c1=0.1,
    c2=0.1,
    max_iterations=20,
    all_possible_transitions=False,
)
crf.fit(X_train, y_train)

CRF(algorithm='lbfgs', all_possible_transitions=False, c1=0.1, c2=0.1,
    keep_tempfiles=None, max_iterations=20)

In [37]:
eli5.show_weights(crf, top=30)

From \ To,O,B-BOOK,I-BOOK,B-COMPOSER,I-COMPOSER,B-FILM,I-FILM,B-SINGER,I-SINGER,B-SONG,I-SONG
O,0.707,2.192,0.0,2.277,0.0,3.558,0.0,4.117,0.0,5.922,0.0
B-BOOK,0.0,0.0,6.894,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
I-BOOK,0.0,0.0,5.476,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
B-COMPOSER,-0.176,0.0,0.0,0.0,3.701,0.0,0.0,0.0,0.0,0.0,0.0
I-COMPOSER,0.005,0.0,0.0,0.0,2.87,0.0,0.0,0.0,0.0,0.0,0.0
B-FILM,-0.116,0.0,0.0,0.0,0.0,0.0,6.463,0.0,0.0,0.0,0.0
I-FILM,-0.236,0.0,0.0,0.0,0.0,0.0,5.112,0.0,0.0,0.0,0.0
B-SINGER,-0.528,0.0,0.0,0.0,0.0,0.0,0.0,0.0,7.736,0.0,0.0
I-SINGER,0.262,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4.537,0.0,0.0
B-SONG,-0.491,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,6.266

Weight?,Feature,Unnamed: 2_level_0,Unnamed: 3_level_0,Unnamed: 4_level_0,Unnamed: 5_level_0,Unnamed: 6_level_0,Unnamed: 7_level_0,Unnamed: 8_level_0,Unnamed: 9_level_0,Unnamed: 10_level_0
Weight?,Feature,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
Weight?,Feature,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2
Weight?,Feature,Unnamed: 2_level_3,Unnamed: 3_level_3,Unnamed: 4_level_3,Unnamed: 5_level_3,Unnamed: 6_level_3,Unnamed: 7_level_3,Unnamed: 8_level_3,Unnamed: 9_level_3,Unnamed: 10_level_3
Weight?,Feature,Unnamed: 2_level_4,Unnamed: 3_level_4,Unnamed: 4_level_4,Unnamed: 5_level_4,Unnamed: 6_level_4,Unnamed: 7_level_4,Unnamed: 8_level_4,Unnamed: 9_level_4,Unnamed: 10_level_4
Weight?,Feature,Unnamed: 2_level_5,Unnamed: 3_level_5,Unnamed: 4_level_5,Unnamed: 5_level_5,Unnamed: 6_level_5,Unnamed: 7_level_5,Unnamed: 8_level_5,Unnamed: 9_level_5,Unnamed: 10_level_5
Weight?,Feature,Unnamed: 2_level_6,Unnamed: 3_level_6,Unnamed: 4_level_6,Unnamed: 5_level_6,Unnamed: 6_level_6,Unnamed: 7_level_6,Unnamed: 8_level_6,Unnamed: 9_level_6,Unnamed: 10_level_6
Weight?,Feature,Unnamed: 2_level_7,Unnamed: 3_level_7,Unnamed: 4_level_7,Unnamed: 5_level_7,Unnamed: 6_level_7,Unnamed: 7_level_7,Unnamed: 8_level_7,Unnamed: 9_level_7,Unnamed: 10_level_7
Weight?,Feature,Unnamed: 2_level_8,Unnamed: 3_level_8,Unnamed: 4_level_8,Unnamed: 5_level_8,Unnamed: 6_level_8,Unnamed: 7_level_8,Unnamed: 8_level_8,Unnamed: 9_level_8,Unnamed: 10_level_8
Weight?,Feature,Unnamed: 2_level_9,Unnamed: 3_level_9,Unnamed: 4_level_9,Unnamed: 5_level_9,Unnamed: 6_level_9,Unnamed: 7_level_9,Unnamed: 8_level_9,Unnamed: 9_level_9,Unnamed: 10_level_9
Weight?,Feature,Unnamed: 2_level_10,Unnamed: 3_level_10,Unnamed: 4_level_10,Unnamed: 5_level_10,Unnamed: 6_level_10,Unnamed: 7_level_10,Unnamed: 8_level_10,Unnamed: 9_level_10,Unnamed: 10_level_10
+5.018,word[-3:]:ора,,,,,,,,,
+4.787,word[-3:]:сню,,,,,,,,,
+4.787,word.lower():песню,,,,,,,,,
+4.660,BOS,,,,,,,,,
+4.329,word.lower():композитора,,,,,,,,,
+3.658,word.lower():включи,,,,,,,,,
+3.658,word[-3:]:ючи,,,,,,,,,
+3.129,word.lower():послушаем,,,,,,,,,
+3.028,word.lower():автора,,,,,,,,,
+2.729,word[-3:]:аем,,,,,,,,,

Weight?,Feature
+5.018,word[-3:]:ора
+4.787,word[-3:]:сню
+4.787,word.lower():песню
+4.660,BOS
+4.329,word.lower():композитора
+3.658,word.lower():включи
+3.658,word[-3:]:ючи
+3.129,word.lower():послушаем
+3.028,word.lower():автора
+2.729,word[-3:]:аем

Weight?,Feature
+1.516,-1:postag[:2]:VE
+1.516,-1:postag:VERB
+1.172,-1:word.lower():книгу
+0.908,-1:word.lower():аудиокнигу
+0.886,word[-3:]:ени
+0.591,+1:word.lower():и
+0.587,+1:postag:CONJ
+0.581,+1:postag[:2]:CO
+0.476,postag:ADJF
+0.455,word[-3:]:лик

Weight?,Feature
+1.097,-1:postag[:2]:PR
+1.095,-1:postag:PREP
+0.987,-1:postag:ADJS
+0.986,-1:postag[:2]:AD
+0.762,-1:postag:CONJ
+0.759,-1:postag[:2]:CO
+0.636,-1:word.lower():и
+0.593,postag:ADJF
+0.569,postag:PREP
+0.458,postag:CONJ

Weight?,Feature
+5.740,-1:word.lower():композитора
+0.907,postag[:2]:NO
+0.907,postag:NOUN
+0.819,word.lower():серге
+0.819,word[-3:]:рге
+0.777,word[-3:]:ола
+0.777,word.lower():никола
+0.672,word.lower():антон
+0.606,word[-3:]:тон
+0.583,-1:postag[:2]:VE

Weight?,Feature
+1.808,word[-3:]:вич
+1.332,-1:postag:NOUN
+1.332,-1:postag[:2]:NO
+0.828,-1:word.lower():серге
+0.788,-1:word.lower():никола
+0.731,-1:word.lower():александр
+0.695,+1:word.lower():песню
+0.678,-1:word.lower():антон
+0.651,word.lower():рахманинов
+0.554,word.lower():мясковск

Weight?,Feature
+1.871,-1:word.lower():из
+1.536,-1:word.lower():фильм
+1.307,-1:word.lower():фильма
+1.133,-1:postag:PREP
+0.860,word[-3:]:тор
+0.850,+1:word.lower():песню
+0.830,-1:postag[:2]:PR
+0.807,word.lower():втор
+0.778,postag[:2]:AD
+0.587,+1:postag:PREP

Weight?,Feature
+0.866,-1:postag:PREP
+0.799,-1:postag[:2]:PR
+0.762,-1:postag:ADVB
+0.722,-1:postag:CONJ
+0.718,-1:postag[:2]:CO
+0.665,bias
+0.654,word.lower():два
+0.654,word[-3:]:два
+0.643,+1:postag:NUMR
+0.634,+1:postag[:2]:NU

Weight?,Feature
+7.311,-1:word.lower():автора
+3.147,-1:word.lower():певца
+1.431,word.istitle()
+1.067,-1:postag:VERB
+1.067,-1:postag[:2]:VE
+0.599,-1:word.lower():зажги
+0.558,EOS
+0.473,word[-3:]:мин
+0.448,+1:word.lower():песню
+0.381,word.lower():владимир

Weight?,Feature
+2.339,+1:word.lower():песню
+2.287,-1:word.istitle()
+1.808,word.istitle()
+0.882,-1:postag[:2]:AD
+0.816,EOS
+0.603,-1:postag[:2]:NO
+0.603,-1:postag:NOUN
+0.569,postag:ADVB
+0.550,postag[:2]:AD
+0.542,-1:postag:ADJ

Weight?,Feature
+7.641,-1:word.lower():песню
+2.516,word.istitle()
+1.631,-1:word.lower():дорожку
+1.521,-1:word.lower():произведение
+0.885,word[-3:]:Les
+0.885,word.lower():les
+0.798,word[-3:]:сан
+0.584,word.lower():litaliano
+0.584,word[-3:]:ano
+0.567,word[-3:]:ань

Weight?,Feature
+2.797,+1:word.lower():автора
+2.392,-1:word.istitle()
+1.393,-1:postag[:2]:IN
+1.225,-1:postag:ADJF
+1.142,-1:postag:INTJ
+0.989,-1:postag:NPRO
+0.989,-1:postag[:2]:NP
+0.935,word.istitle()
+0.934,bias
+0.884,EOS


In [38]:
labels = list(crf.classes_)
labels.remove('O')
labels

['B-FILM',
 'B-SONG',
 'B-SINGER',
 'I-SONG',
 'I-SINGER',
 'B-COMPOSER',
 'I-COMPOSER',
 'I-FILM',
 'B-BOOK',
 'I-BOOK']

In [39]:
y_pred = crf.predict(X_test)
metrics.flat_f1_score(y_test, y_pred,
                      average='weighted', labels=labels)

0.6235860877360649

In [40]:
sorted_labels = sorted(
    labels, 
    key=lambda name: (name[1:], name[0])
)
print(metrics.flat_classification_report(
    y_test, y_pred, labels=sorted_labels, digits=3
))

              precision    recall  f1-score   support

      B-BOOK      0.320     0.219     0.260        73
      I-BOOK      0.381     0.416     0.398        89
  B-COMPOSER      0.889     0.457     0.604        35
  I-COMPOSER      0.667     0.571     0.615        21
      B-FILM      0.720     0.261     0.383        69
      I-FILM      0.724     0.356     0.477        59
    B-SINGER      0.588     0.901     0.711       253
    I-SINGER      0.581     0.782     0.667       147
      B-SONG      0.831     0.594     0.693       165
      I-SONG      0.865     0.734     0.794       192

   micro avg      0.636     0.636     0.636      1103
   macro avg      0.657     0.529     0.560      1103
weighted avg      0.664     0.636     0.624      1103





In [41]:
# define fixed parameters and parameters to search
crf = sklearn_crfsuite.CRF(
    algorithm='lbfgs', 
    max_iterations=100, 
    all_possible_transitions=True
)
params_space = {
    'c1': scipy.stats.expon(scale=0.5),
    'c2': scipy.stats.expon(scale=0.05),
}

# use the same metric for evaluation
f1_scorer = make_scorer(metrics.flat_f1_score, 
                        average='weighted', labels=labels)

# search
rs = RandomizedSearchCV(crf, params_space, 
                        cv=3, 
                        verbose=1, 
                        n_jobs=-1, 
                        n_iter=50, 
                        scoring=f1_scorer)
rs.fit(X_train, y_train)

Fitting 3 folds for each of 50 candidates, totalling 150 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:    3.6s
[Parallel(n_jobs=-1)]: Done 150 out of 150 | elapsed:   14.5s finished


RandomizedSearchCV(cv=3,
                   estimator=CRF(algorithm='lbfgs',
                                 all_possible_transitions=True,
                                 keep_tempfiles=None, max_iterations=100),
                   n_iter=50, n_jobs=-1,
                   param_distributions={'c1': <scipy.stats._distn_infrastructure.rv_frozen object at 0x15e351cd0>,
                                        'c2': <scipy.stats._distn_infrastructure.rv_frozen object at 0x15f451d00>},
                   scoring=make_scorer(flat_f1_score, average=weighted, labels=['B-FILM', 'B-SONG', 'B-SINGER', 'I-SONG', 'I-SINGER', 'B-COMPOSER', 'I-COMPOSER', 'I-FILM', 'B-BOOK', 'I-BOOK']),
                   verbose=1)

In [42]:
# crf = rs.best_estimator_
print('best params:', rs.best_params_)
print('best CV score:', rs.best_score_)
print('model size: {:0.2f}M'.format(rs.best_estimator_.size_ / 1000000))

best params: {'c1': 0.0019015394428813202, 'c2': 0.1886651360699415}
best CV score: 0.7483519860993625
model size: 0.68M


In [43]:
crf = rs.best_estimator_
y_pred = crf.predict(X_test)
print(metrics.flat_classification_report(
    y_test, y_pred, labels=sorted_labels, digits=3
))

              precision    recall  f1-score   support

      B-BOOK      0.553     0.288     0.378        73
      I-BOOK      0.662     0.506     0.573        89
  B-COMPOSER      0.818     0.514     0.632        35
  I-COMPOSER      0.733     0.524     0.611        21
      B-FILM      0.707     0.420     0.527        69
      I-FILM      0.689     0.525     0.596        59
    B-SINGER      0.609     0.874     0.718       253
    I-SINGER      0.597     0.776     0.675       147
      B-SONG      0.809     0.642     0.716       165
      I-SONG      0.859     0.823     0.840       192

   micro avg      0.687     0.684     0.685      1103
   macro avg      0.704     0.589     0.627      1103
weighted avg      0.701     0.684     0.676      1103



In [44]:
from collections import Counter

def print_transitions(trans_features):
    for (label_from, label_to), weight in trans_features:
        print("%-6s -> %-7s %0.6f" % (label_from, label_to, weight))

print("Top likely transitions:")
print_transitions(Counter(crf.transition_features_).most_common(20))

print("\nTop unlikely transitions:")
print_transitions(Counter(crf.transition_features_).most_common()[-20:])

Top likely transitions:
B-SINGER -> I-SINGER 5.548490
B-SONG -> I-SONG  5.023144
I-SONG -> I-SONG  4.905125
B-BOOK -> I-BOOK  4.635808
B-FILM -> I-FILM  4.413896
I-FILM -> I-FILM  4.326321
I-BOOK -> I-BOOK  4.210976
I-SINGER -> I-SINGER 3.860774
B-COMPOSER -> I-COMPOSER 3.822669
I-COMPOSER -> I-COMPOSER 3.151252
O      -> B-SONG  2.891483
O      -> B-FILM  2.598696
O      -> B-SINGER 1.904074
O      -> B-BOOK  1.624749
O      -> B-COMPOSER 1.461307
I-SONG -> O       0.471503
I-SINGER -> O       0.151076
B-SINGER -> O       0.138368
B-COMPOSER -> O       -0.023949
O      -> O       -0.061645

Top unlikely transitions:
B-BOOK -> I-FILM  -0.801595
B-COMPOSER -> I-BOOK  -0.860410
I-SONG -> I-SINGER -0.866745
B-SONG -> I-SINGER -0.879142
B-SONG -> I-FILM  -0.917930
B-COMPOSER -> I-FILM  -0.920273
B-BOOK -> O       -0.937964
B-FILM -> I-BOOK  -0.970867
B-SONG -> I-BOOK  -0.972147
B-SINGER -> I-BOOK  -1.000933
B-SINGER -> I-FILM  -1.016942
B-SINGER -> I-COMPOSER -1.108503
I-SINGER -> I-SONG  