In [17]:
import json
import sklearn_crfsuite

In [39]:
def word2features(sent, i):
    features = {
        'word': sent[i]['text'].encode('utf-8'),
        'sent_len': len(sent),
        'pos_in_sent': i,
        'upos': sent[i]['upos'],
        'xpos': sent[i]['xpos'],
        'first_word_in_sent': sent[0]['text'].encode('utf-8')
    }

    if i > 0:
        features.update({
            'prev_word': sent[i-1]['text'].encode('utf-8')
        })
    else:
        features.update({
            'BOS': True
        })

    if i < len(sent)-1:
        features.update({
            'next_word': sent[i+1]['text'].encode('utf-8')
        })
    else:
        features.update({
            'EOS': True
        })

    return features

In [20]:
def sent2features(sent):
    return [word2features(sent, i) for i in range(len(sent))]

In [42]:
def word2label(sent, i):
    if i < len(sent) - 1:
        if sent[i+1]['upos'] == 'PUNCT':
            return sent[i+1]['text'].encode('utf-8')
        else:
            return ''
    else:
        return ''

In [22]:
def sent2labels(sent):
    return [word2label(sent, i) for i in range(len(sent))]

In [32]:
f = open('../data/Bible/processed/Bibliia_clean_dev.json', "r", encoding="utf-8")

In [35]:
data = json.load(f)

In [36]:
data[0]

[{'id': 1, 'text': 'И', 'upos': 'CCONJ', 'xpos': 'Cp'},
 {'id': 2,
  'text': 'когато',
  'upos': 'ADV',
  'xpos': 'Prt',
  'feats': 'PronType=Rel'},
 {'id': 3,
  'text': 'станах',
  'upos': 'VERB',
  'xpos': 'Vppif-o1s',
  'feats': 'Aspect=Perf|Mood=Ind|Number=Sing|Person=1|Tense=Past|VerbForm=Fin|Voice=Act'},
 {'id': 4, 'text': 'на', 'upos': 'ADP', 'xpos': 'R'},
 {'id': 5,
  'text': 'сутринта',
  'upos': 'NOUN',
  'xpos': 'Ncfsd',
  'feats': 'Definite=Def|Gender=Fem|Number=Sing',
  'misc': 'SpaceAfter=No'},
 {'id': 6, 'text': ',', 'upos': 'PUNCT', 'xpos': 'punct'},
 {'id': 7, 'text': 'за', 'upos': 'ADP', 'xpos': 'R'},
 {'id': 8, 'text': 'да', 'upos': 'AUX', 'xpos': 'Tx'},
 {'id': 9,
  'text': 'накърмя',
  'upos': 'VERB',
  'xpos': 'Vpptf-r1s',
  'feats': 'Aspect=Perf|Mood=Ind|Number=Sing|Person=1|Tense=Pres|VerbForm=Fin|Voice=Act'},
 {'id': 10,
  'text': 'сина',
  'upos': 'NOUN',
  'xpos': 'Ncmsh',
  'feats': 'Definite=Def|Gender=Masc|Number=Sing'},
 {'id': 11,
  'text': 'си',
  'upos

In [23]:
sent2features(data[0])

[{'word': b'\xd0\x98',
  'sent_len': 36,
  'pos_in_sent': 0,
  'upos': 'CCONJ',
  'xpos': 'Cp',
  'first_word_in_sent': b'\xd0\x98',
  'BOS': True,
  'next_word': b'\xd0\xba\xd0\xbe\xd0\xb3\xd0\xb0\xd1\x82\xd0\xbe'},
 {'word': b'\xd0\xba\xd0\xbe\xd0\xb3\xd0\xb0\xd1\x82\xd0\xbe',
  'sent_len': 36,
  'pos_in_sent': 1,
  'upos': 'ADV',
  'xpos': 'Prt',
  'first_word_in_sent': b'\xd0\x98',
  'prev_word': b'\xd0\x98',
  'next_word': b'\xd1\x81\xd1\x82\xd0\xb0\xd0\xbd\xd0\xb0\xd1\x85'},
 {'word': b'\xd1\x81\xd1\x82\xd0\xb0\xd0\xbd\xd0\xb0\xd1\x85',
  'sent_len': 36,
  'pos_in_sent': 2,
  'upos': 'VERB',
  'xpos': 'Vppif-o1s',
  'first_word_in_sent': b'\xd0\x98',
  'prev_word': b'\xd0\xba\xd0\xbe\xd0\xb3\xd0\xb0\xd1\x82\xd0\xbe',
  'next_word': b'\xd0\xbd\xd0\xb0'},
 {'word': b'\xd0\xbd\xd0\xb0',
  'sent_len': 36,
  'pos_in_sent': 3,
  'upos': 'ADP',
  'xpos': 'R',
  'first_word_in_sent': b'\xd0\x98',
  'prev_word': b'\xd1\x81\xd1\x82\xd0\xb0\xd0\xbd\xd0\xb0\xd1\x85',
  'next_word': b'\xd1\x8

In [24]:
sent2labels(data[0])

['',
 '',
 '',
 '',
 b',',
 '',
 '',
 '',
 '',
 '',
 b',',
 '',
 b',',
 '',
 '',
 '',
 b';',
 '',
 '',
 '',
 '',
 '',
 '',
 b',',
 '',
 b',',
 '',
 '',
 '',
 '',
 b',',
 '',
 '',
 '',
 b'.',
 '']

In [44]:
%%time
X_train = [sent2features(sent) for sent in data]
y_train = [sent2labels(sent) for sent in data]

CPU times: user 130 ms, sys: 12.1 ms, total: 142 ms
Wall time: 140 ms


In [45]:
%%time
crf = sklearn_crfsuite.CRF(
    algorithm='lbfgs',
    c1=0.1,
    c2=0.1,
    max_iterations=100,
    all_possible_transitions=True
)
crf.fit(X_train, y_train)

CPU times: user 10.9 s, sys: 3.25 ms, total: 10.9 s
Wall time: 10.9 s


In [46]:
crf.classes_

UnicodeDecodeError: 'ascii' codec can't decode byte 0xe2 in position 0: ordinal not in range(128)