### Function Definitions

In [1]:
from ipynb.fs.full.utilities import *

### CRF Model

### Sentence contains imperative verb

In [23]:
# input - sentence (list of word dictionaries with 'text', 'upos' and 'xpos' keys) and the index of one word in it
# output - a single dictionary of features describing that word
def word2features(sentence, i):
    """Return the feature dictionary for the word at index ``i`` of ``sentence``.

    Variant that adds the sentence-level imperative-verb feature.
    """
    # Sentence-level cues; every helper result is unpacked as a (contains, position) pair.
    has_q_word, q_word_pos = contains_interrogative_word(sentence)
    has_q_particle, q_particle_pos = contains_interrogative_particle(sentence)
    has_imperative, imperative_pos = contains_imperative_verb(sentence)

    current = sentence[i]
    n_words = len(sentence)

    features = {}
    features['word'] = current['text']
    features['sent_len'] = n_words
    features['pos_in_sent'] = i
    features['upos'] = current['upos']
    features['xpos'] = current['xpos']
    features['first_word_in_sent'] = sentence[0]['text']
    features['contains_interrogative_word'] = has_q_word
    features['interrogative_word_position'] = q_word_pos
    features['contains_interrogative_particle'] = has_q_particle
    features['interrogative_particle_position'] = q_particle_pos
    features['contains_imperative_verb'] = has_imperative
    features['imperative_verb_position'] = imperative_pos

    # One word of left context, or a beginning-of-sentence marker.
    if i <= 0:
        features['BOS'] = True
    else:
        previous = sentence[i - 1]
        features['prev_word'] = previous['text']
        features['prev_word_upos'] = previous['upos']
        features['prev_word_xpos'] = previous['xpos']

    # One word of right context, or an end-of-sentence marker.
    if i >= n_words - 1:
        features['EOS'] = True
    else:
        following = sentence[i + 1]
        features['next_word'] = following['text']
        features['next_word_upos'] = following['upos']
        features['next_word_xpos'] = following['xpos']

    # Second word to the right, when the sentence is long enough.
    if i + 2 < n_words:
        second_following = sentence[i + 2]
        features['word_after_next_word'] = second_following['text']
        features['word_after_next_word_upos'] = second_following['upos']
        features['word_after_next_word_xpos'] = second_following['xpos']

    return features

In [24]:
%%time
# Build the training inputs/labels from Train.txt. The recorded output shows Bulgarian
# tokenize and POS processors being loaded and ~23 min 17 s of wall time.
# NOTE(review): data_prep comes from the utilities import; presumably it must pick up the
# word2features defined in the cell above for this variant to take effect — confirm.
X_train, y_train = data_prep('../../data/Train.txt')

2021-09-29 00:35:06 INFO: Loading these models for language: bg (Bulgarian):
| Processor | Package  |
------------------------
| tokenize  | standard |

2021-09-29 00:35:06 INFO: Use device: gpu
2021-09-29 00:35:06 INFO: Loading: tokenize
2021-09-29 00:35:06 INFO: Done loading processors!
2021-09-29 00:35:30 INFO: Loading these models for language: bg (Bulgarian):
| Processor | Package  |
------------------------
| tokenize  | standard |
| pos       | standard |

2021-09-29 00:35:30 INFO: Use device: gpu
2021-09-29 00:35:30 INFO: Loading: tokenize
2021-09-29 00:35:30 INFO: Loading: pos
2021-09-29 00:35:36 INFO: Done loading processors!


CPU times: user 23min 14s, sys: 3.57 s, total: 23min 18s
Wall time: 23min 17s


In [25]:
# Presumably a sanity check of the prepared training split (helper comes from the
# utilities import); no output was recorded for this cell.
verify_prepped_data(X_train, y_train)

In [26]:
%%time
# Build the dev inputs/labels from Dev.txt with the same data_prep pipeline
# (recorded wall time: ~11 min 49 s).
X_dev, y_dev = data_prep('../../data/Dev.txt')

2021-09-29 00:58:24 INFO: Loading these models for language: bg (Bulgarian):
| Processor | Package  |
------------------------
| tokenize  | standard |

2021-09-29 00:58:24 INFO: Use device: gpu
2021-09-29 00:58:24 INFO: Loading: tokenize
2021-09-29 00:58:24 INFO: Done loading processors!
2021-09-29 00:58:37 INFO: Loading these models for language: bg (Bulgarian):
| Processor | Package  |
------------------------
| tokenize  | standard |
| pos       | standard |

2021-09-29 00:58:37 INFO: Use device: gpu
2021-09-29 00:58:37 INFO: Loading: tokenize
2021-09-29 00:58:37 INFO: Loading: pos
2021-09-29 00:58:38 INFO: Done loading processors!


CPU times: user 11min 49s, sys: 1.09 s, total: 11min 50s
Wall time: 11min 49s


In [27]:
# Presumably a sanity check of the prepared dev split (helper comes from the
# utilities import); no output was recorded for this cell.
verify_prepped_data(X_dev, y_dev)

In [28]:
%%time
# Fit a CRF on the training split with L-BFGS. Per the sklearn-crfsuite docs, c1 and c2
# are the L1 and L2 regularisation coefficients, and all_possible_transitions also
# generates transition features for label pairs never seen in training.
# Recorded wall time for this fit: ~29 min 1 s.
crf = sklearn_crfsuite.CRF(
    algorithm='lbfgs',
    c1=0.1,
    c2=0.1,
    max_iterations=100,
    all_possible_transitions=True
)
crf.fit(X_train, y_train)

CPU times: user 29min 3s, sys: 1.64 s, total: 29min 4s
Wall time: 29min 1s


In [None]:
# Evaluate on the dev split. First pass: weighted F1 and per-class report over every
# label the fitted CRF knows.
# NOTE(review): this cell has no execution count or recorded output in the saved notebook.
labels = list(crf.classes_)
y_pred = crf.predict(X_dev)
print(metrics.flat_f1_score(y_dev, y_pred, average='weighted', labels=labels))
print(metrics.flat_classification_report(y_dev, y_pred, labels=labels, digits=3))
# Second pass: drop the '' label (presumably "no punctuation" — confirm) and rescore.
# NOTE(review): list.remove raises ValueError if '' is not among crf.classes_.
labels.remove('')
print(metrics.flat_f1_score(y_dev, y_pred, average='weighted', labels=labels))
print(metrics.flat_classification_report(y_dev, y_pred, labels=labels, digits=3))
# Apply the predicted labels to the dev inputs and show the first 100 elements of the
# result (characters, if punctuate returns a string).
punctuated_sentences = punctuate(X_dev, y_pred)
print(punctuated_sentences[:100])

### Sentence contains relative pronoun

In [30]:
# input - sentence (list of word dictionaries with 'text', 'upos' and 'xpos' keys) and the index of one word in it
# output - a single dictionary of features describing that word
def word2features(sentence, i):
    """Return the feature dictionary for the word at index ``i`` of ``sentence``.

    Variant that adds the sentence-level relative-pronoun feature.
    """
    # Sentence-level cues; every helper result is unpacked as a (contains, position) pair.
    has_q_word, q_word_pos = contains_interrogative_word(sentence)
    has_q_particle, q_particle_pos = contains_interrogative_particle(sentence)
    has_rel_pronoun, rel_pronoun_pos = contains_relative_pronoun(sentence)

    current = sentence[i]
    n_words = len(sentence)

    features = {}
    features['word'] = current['text']
    features['sent_len'] = n_words
    features['pos_in_sent'] = i
    features['upos'] = current['upos']
    features['xpos'] = current['xpos']
    features['first_word_in_sent'] = sentence[0]['text']
    features['contains_interrogative_word'] = has_q_word
    features['interrogative_word_position'] = q_word_pos
    features['contains_interrogative_particle'] = has_q_particle
    features['interrogative_particle_position'] = q_particle_pos
    features['contains_relative_pronoun'] = has_rel_pronoun
    features['relative_pronoun_position'] = rel_pronoun_pos

    # One word of left context, or a beginning-of-sentence marker.
    if i <= 0:
        features['BOS'] = True
    else:
        previous = sentence[i - 1]
        features['prev_word'] = previous['text']
        features['prev_word_upos'] = previous['upos']
        features['prev_word_xpos'] = previous['xpos']

    # One word of right context, or an end-of-sentence marker.
    if i >= n_words - 1:
        features['EOS'] = True
    else:
        following = sentence[i + 1]
        features['next_word'] = following['text']
        features['next_word_upos'] = following['upos']
        features['next_word_xpos'] = following['xpos']

    # Second word to the right, when the sentence is long enough.
    if i + 2 < n_words:
        second_following = sentence[i + 2]
        features['word_after_next_word'] = second_following['text']
        features['word_after_next_word_upos'] = second_following['upos']
        features['word_after_next_word_xpos'] = second_following['xpos']

    return features

In [31]:
%%time
# Build the training inputs/labels from Train.txt. The recorded output shows Bulgarian
# tokenize and POS processors being loaded and ~23 min 17 s of wall time.
# NOTE(review): data_prep comes from the utilities import; presumably it must pick up the
# word2features defined in the cell above for this variant to take effect — confirm.
X_train, y_train = data_prep('../../data/Train.txt')

2021-09-29 01:39:26 INFO: Loading these models for language: bg (Bulgarian):
| Processor | Package  |
------------------------
| tokenize  | standard |

2021-09-29 01:39:26 INFO: Use device: gpu
2021-09-29 01:39:26 INFO: Loading: tokenize
2021-09-29 01:39:26 INFO: Done loading processors!
2021-09-29 01:39:44 INFO: Loading these models for language: bg (Bulgarian):
| Processor | Package  |
------------------------
| tokenize  | standard |
| pos       | standard |

2021-09-29 01:39:44 INFO: Use device: gpu
2021-09-29 01:39:44 INFO: Loading: tokenize
2021-09-29 01:39:44 INFO: Loading: pos
2021-09-29 01:39:45 INFO: Done loading processors!


CPU times: user 23min 16s, sys: 2.07 s, total: 23min 18s
Wall time: 23min 17s


In [32]:
# Presumably a sanity check of the prepared training split (helper comes from the
# utilities import); no output was recorded for this cell.
verify_prepped_data(X_train, y_train)

In [33]:
%%time
# Build the dev inputs/labels from Dev.txt with the same data_prep pipeline
# (recorded wall time: ~11 min 47 s).
X_dev, y_dev = data_prep('../../data/Dev.txt')

2021-09-29 02:02:44 INFO: Loading these models for language: bg (Bulgarian):
| Processor | Package  |
------------------------
| tokenize  | standard |

2021-09-29 02:02:44 INFO: Use device: gpu
2021-09-29 02:02:44 INFO: Loading: tokenize
2021-09-29 02:02:44 INFO: Done loading processors!
2021-09-29 02:02:56 INFO: Loading these models for language: bg (Bulgarian):
| Processor | Package  |
------------------------
| tokenize  | standard |
| pos       | standard |

2021-09-29 02:02:56 INFO: Use device: gpu
2021-09-29 02:02:56 INFO: Loading: tokenize
2021-09-29 02:02:56 INFO: Loading: pos
2021-09-29 02:02:58 INFO: Done loading processors!


CPU times: user 11min 47s, sys: 1.06 s, total: 11min 48s
Wall time: 11min 47s


In [34]:
# Presumably a sanity check of the prepared dev split (helper comes from the
# utilities import); no output was recorded for this cell.
verify_prepped_data(X_dev, y_dev)

In [35]:
%%time
# Fit a CRF on the training split with L-BFGS. Per the sklearn-crfsuite docs, c1 and c2
# are the L1 and L2 regularisation coefficients, and all_possible_transitions also
# generates transition features for label pairs never seen in training.
# Recorded wall time for this fit: ~30 min 22 s.
crf = sklearn_crfsuite.CRF(
    algorithm='lbfgs',
    c1=0.1,
    c2=0.1,
    max_iterations=100,
    all_possible_transitions=True
)
crf.fit(X_train, y_train)

CPU times: user 30min 24s, sys: 1.36 s, total: 30min 25s
Wall time: 30min 22s


In [None]:
# Evaluate on the dev split. First pass: weighted F1 and per-class report over every
# label the fitted CRF knows.
# NOTE(review): this cell has no execution count or recorded output in the saved notebook.
labels = list(crf.classes_)
y_pred = crf.predict(X_dev)
print(metrics.flat_f1_score(y_dev, y_pred, average='weighted', labels=labels))
print(metrics.flat_classification_report(y_dev, y_pred, labels=labels, digits=3))
# Second pass: drop the '' label (presumably "no punctuation" — confirm) and rescore.
# NOTE(review): list.remove raises ValueError if '' is not among crf.classes_.
labels.remove('')
print(metrics.flat_f1_score(y_dev, y_pred, average='weighted', labels=labels))
print(metrics.flat_classification_report(y_dev, y_pred, labels=labels, digits=3))
# Apply the predicted labels to the dev inputs and show the first 100 elements of the
# result (characters, if punctuate returns a string).
punctuated_sentences = punctuate(X_dev, y_pred)
print(punctuated_sentences[:100])

### Contains repeated coordinating conjunction before the word

In [24]:
# input - sentence (list of word dictionaries with 'text', 'upos' and 'xpos' keys) and the index of one word in it
# output - a single dictionary of features describing that word
def word2features(sentence, i):
    """Return the feature dictionary for the word at index ``i`` of ``sentence``.

    Variant that adds the repetitive-coordinating-conjunction feature, whose helper
    receives the word index as well as the sentence.
    """
    # Helper results are unpacked as (contains, position) pairs; the last one depends on i.
    has_q_word, q_word_pos = contains_interrogative_word(sentence)
    has_q_particle, q_particle_pos = contains_interrogative_particle(sentence)
    has_rep_conj, rep_conj_pos = contains_repetitive_coord_conj_before(sentence, i)

    current = sentence[i]
    n_words = len(sentence)

    features = {}
    features['word'] = current['text']
    features['sent_len'] = n_words
    features['pos_in_sent'] = i
    features['upos'] = current['upos']
    features['xpos'] = current['xpos']
    features['first_word_in_sent'] = sentence[0]['text']
    features['contains_interrogative_word'] = has_q_word
    features['interrogative_word_position'] = q_word_pos
    features['contains_interrogative_particle'] = has_q_particle
    features['interrogative_particle_position'] = q_particle_pos
    features['contains_repetitive_coord_conj_before'] = has_rep_conj
    features['repetitive_coord_conj_position'] = rep_conj_pos

    # One word of left context, or a beginning-of-sentence marker.
    if i <= 0:
        features['BOS'] = True
    else:
        previous = sentence[i - 1]
        features['prev_word'] = previous['text']
        features['prev_word_upos'] = previous['upos']
        features['prev_word_xpos'] = previous['xpos']

    # One word of right context, or an end-of-sentence marker.
    if i >= n_words - 1:
        features['EOS'] = True
    else:
        following = sentence[i + 1]
        features['next_word'] = following['text']
        features['next_word_upos'] = following['upos']
        features['next_word_xpos'] = following['xpos']

    # Second word to the right, when the sentence is long enough.
    if i + 2 < n_words:
        second_following = sentence[i + 2]
        features['word_after_next_word'] = second_following['text']
        features['word_after_next_word_upos'] = second_following['upos']
        features['word_after_next_word_xpos'] = second_following['xpos']

    return features

In [25]:
%%time
# Build the training inputs/labels from Train.txt. The recorded output shows Bulgarian
# tokenize and POS processors being loaded and ~23 min 6 s of wall time.
# NOTE(review): data_prep comes from the utilities import; presumably it must pick up the
# word2features defined in the cell above for this variant to take effect — confirm.
X_train, y_train = data_prep('../../data/Train.txt')

2021-09-29 09:46:52 INFO: Loading these models for language: bg (Bulgarian):
| Processor | Package  |
------------------------
| tokenize  | standard |

2021-09-29 09:46:52 INFO: Use device: gpu
2021-09-29 09:46:52 INFO: Loading: tokenize
2021-09-29 09:46:52 INFO: Done loading processors!
2021-09-29 09:47:14 INFO: Loading these models for language: bg (Bulgarian):
| Processor | Package  |
------------------------
| tokenize  | standard |
| pos       | standard |

2021-09-29 09:47:14 INFO: Use device: gpu
2021-09-29 09:47:14 INFO: Loading: tokenize
2021-09-29 09:47:14 INFO: Loading: pos
2021-09-29 09:47:16 INFO: Done loading processors!


CPU times: user 23min 4s, sys: 3.16 s, total: 23min 7s
Wall time: 23min 6s


In [26]:
# Presumably a sanity check of the prepared training split (helper comes from the
# utilities import); no output was recorded for this cell.
verify_prepped_data(X_train, y_train)

In [27]:
%%time
# Build the dev inputs/labels from Dev.txt with the same data_prep pipeline
# (recorded wall time: ~11 min 47 s).
X_dev, y_dev = data_prep('../../data/Dev.txt')

2021-09-29 10:09:58 INFO: Loading these models for language: bg (Bulgarian):
| Processor | Package  |
------------------------
| tokenize  | standard |

2021-09-29 10:09:58 INFO: Use device: gpu
2021-09-29 10:09:58 INFO: Loading: tokenize
2021-09-29 10:09:58 INFO: Done loading processors!
2021-09-29 10:10:11 INFO: Loading these models for language: bg (Bulgarian):
| Processor | Package  |
------------------------
| tokenize  | standard |
| pos       | standard |

2021-09-29 10:10:11 INFO: Use device: gpu
2021-09-29 10:10:11 INFO: Loading: tokenize
2021-09-29 10:10:11 INFO: Loading: pos
2021-09-29 10:10:13 INFO: Done loading processors!


CPU times: user 11min 46s, sys: 1.33 s, total: 11min 48s
Wall time: 11min 47s


In [28]:
# Presumably a sanity check of the prepared dev split (helper comes from the
# utilities import); no output was recorded for this cell.
verify_prepped_data(X_dev, y_dev)

In [29]:
%%time
# Fit a CRF on the training split with L-BFGS. Per the sklearn-crfsuite docs, c1 and c2
# are the L1 and L2 regularisation coefficients, and all_possible_transitions also
# generates transition features for label pairs never seen in training.
# Recorded wall time for this fit: ~28 min 57 s.
crf = sklearn_crfsuite.CRF(
    algorithm='lbfgs',
    c1=0.1,
    c2=0.1,
    max_iterations=100,
    all_possible_transitions=True
)
crf.fit(X_train, y_train)

CPU times: user 28min 59s, sys: 1.67 s, total: 29min 1s
Wall time: 28min 57s


In [None]:
# Evaluate on the dev split. First pass: weighted F1 and per-class report over every
# label the fitted CRF knows.
# NOTE(review): this cell has no execution count or recorded output in the saved notebook.
labels = list(crf.classes_)
y_pred = crf.predict(X_dev)
print(metrics.flat_f1_score(y_dev, y_pred, average='weighted', labels=labels))
print(metrics.flat_classification_report(y_dev, y_pred, labels=labels, digits=3))
# Second pass: drop the '' label (presumably "no punctuation" — confirm) and rescore.
# NOTE(review): list.remove raises ValueError if '' is not among crf.classes_.
labels.remove('')
print(metrics.flat_f1_score(y_dev, y_pred, average='weighted', labels=labels))
print(metrics.flat_classification_report(y_dev, y_pred, labels=labels, digits=3))
# Apply the predicted labels to the dev inputs and show the first 100 elements of the
# result (characters, if punctuate returns a string).
punctuated_sentences = punctuate(X_dev, y_pred)
print(punctuated_sentences[:100])

### Add more context of words before

In [31]:
# input - sentence (list of word dictionaries with 'text', 'upos' and 'xpos' keys) and the index of one word in it
# output - a single dictionary of features describing that word
def word2features(sentence, i):
    """Return the feature dictionary for the word at index ``i`` of ``sentence``.

    Variant with up to two words of left context and up to two words of right context.
    """
    # Sentence-level cues; every helper result is unpacked as a (contains, position) pair.
    has_q_word, q_word_pos = contains_interrogative_word(sentence)
    has_q_particle, q_particle_pos = contains_interrogative_particle(sentence)

    current = sentence[i]
    n_words = len(sentence)

    features = {}
    features['word'] = current['text']
    features['sent_len'] = n_words
    features['pos_in_sent'] = i
    features['upos'] = current['upos']
    features['xpos'] = current['xpos']
    features['first_word_in_sent'] = sentence[0]['text']
    features['contains_interrogative_word'] = has_q_word
    features['interrogative_word_position'] = q_word_pos
    features['contains_interrogative_particle'] = has_q_particle
    features['interrogative_particle_position'] = q_particle_pos

    # One word of left context, or a beginning-of-sentence marker.
    if i <= 0:
        features['BOS'] = True
    else:
        previous = sentence[i - 1]
        features['prev_word'] = previous['text']
        features['prev_word_upos'] = previous['upos']
        features['prev_word_xpos'] = previous['xpos']

    # Second word to the left, when it exists.
    if i >= 2:
        second_previous = sentence[i - 2]
        features['word_before_prev_word'] = second_previous['text']
        features['word_before_prev_word_upos'] = second_previous['upos']
        features['word_before_prev_word_xpos'] = second_previous['xpos']

    # One word of right context, or an end-of-sentence marker.
    if i >= n_words - 1:
        features['EOS'] = True
    else:
        following = sentence[i + 1]
        features['next_word'] = following['text']
        features['next_word_upos'] = following['upos']
        features['next_word_xpos'] = following['xpos']

    # Second word to the right, when the sentence is long enough.
    if i + 2 < n_words:
        second_following = sentence[i + 2]
        features['word_after_next_word'] = second_following['text']
        features['word_after_next_word_upos'] = second_following['upos']
        features['word_after_next_word_xpos'] = second_following['xpos']

    return features

In [32]:
%%time
# Build the training inputs/labels from Train.txt. The recorded output shows Bulgarian
# tokenize and POS processors being loaded and ~23 min 15 s of wall time.
# NOTE(review): data_prep comes from the utilities import; presumably it must pick up the
# word2features defined in the cell above for this variant to take effect — confirm.
X_train, y_train = data_prep('../../data/Train.txt')

2021-09-29 10:50:57 INFO: Loading these models for language: bg (Bulgarian):
| Processor | Package  |
------------------------
| tokenize  | standard |

2021-09-29 10:50:57 INFO: Use device: gpu
2021-09-29 10:50:57 INFO: Loading: tokenize
2021-09-29 10:50:57 INFO: Done loading processors!
2021-09-29 10:51:14 INFO: Loading these models for language: bg (Bulgarian):
| Processor | Package  |
------------------------
| tokenize  | standard |
| pos       | standard |

2021-09-29 10:51:14 INFO: Use device: gpu
2021-09-29 10:51:14 INFO: Loading: tokenize
2021-09-29 10:51:14 INFO: Loading: pos
2021-09-29 10:51:15 INFO: Done loading processors!


CPU times: user 23min 14s, sys: 2.34 s, total: 23min 16s
Wall time: 23min 15s


In [33]:
# Presumably a sanity check of the prepared training split (helper comes from the
# utilities import); no output was recorded for this cell.
verify_prepped_data(X_train, y_train)

In [34]:
%%time
# Build the dev inputs/labels from Dev.txt with the same data_prep pipeline
# (recorded wall time: ~11 min 47 s).
X_dev, y_dev = data_prep('../../data/Dev.txt')

2021-09-29 11:14:12 INFO: Loading these models for language: bg (Bulgarian):
| Processor | Package  |
------------------------
| tokenize  | standard |

2021-09-29 11:14:12 INFO: Use device: gpu
2021-09-29 11:14:12 INFO: Loading: tokenize
2021-09-29 11:14:12 INFO: Done loading processors!
2021-09-29 11:14:25 INFO: Loading these models for language: bg (Bulgarian):
| Processor | Package  |
------------------------
| tokenize  | standard |
| pos       | standard |

2021-09-29 11:14:25 INFO: Use device: gpu
2021-09-29 11:14:25 INFO: Loading: tokenize
2021-09-29 11:14:25 INFO: Loading: pos
2021-09-29 11:14:27 INFO: Done loading processors!


CPU times: user 11min 47s, sys: 1.13 s, total: 11min 48s
Wall time: 11min 47s


In [35]:
# Presumably a sanity check of the prepared dev split (helper comes from the
# utilities import); no output was recorded for this cell.
verify_prepped_data(X_dev, y_dev)

In [36]:
%%time
# Fit a CRF on the training split with L-BFGS. Per the sklearn-crfsuite docs, c1 and c2
# are the L1 and L2 regularisation coefficients, and all_possible_transitions also
# generates transition features for label pairs never seen in training.
# Recorded wall time for this fit: ~33 min 13 s.
crf = sklearn_crfsuite.CRF(
    algorithm='lbfgs',
    c1=0.1,
    c2=0.1,
    max_iterations=100,
    all_possible_transitions=True
)
crf.fit(X_train, y_train)

CPU times: user 33min 15s, sys: 2.07 s, total: 33min 18s
Wall time: 33min 13s


In [None]:
# Evaluate on the dev split. First pass: weighted F1 and per-class report over every
# label the fitted CRF knows.
# NOTE(review): this cell has no execution count or recorded output in the saved notebook.
labels = list(crf.classes_)
y_pred = crf.predict(X_dev)
print(metrics.flat_f1_score(y_dev, y_pred, average='weighted', labels=labels))
print(metrics.flat_classification_report(y_dev, y_pred, labels=labels, digits=3))
# Second pass: drop the '' label (presumably "no punctuation" — confirm) and rescore.
# NOTE(review): list.remove raises ValueError if '' is not among crf.classes_.
labels.remove('')
print(metrics.flat_f1_score(y_dev, y_pred, average='weighted', labels=labels))
print(metrics.flat_classification_report(y_dev, y_pred, labels=labels, digits=3))
# Apply the predicted labels to the dev inputs and show the first 100 elements of the
# result (characters, if punctuate returns a string).
punctuated_sentences = punctuate(X_dev, y_pred)
print(punctuated_sentences[:100])

In [39]:
# input - sentence (list of word dictionaries with 'text', 'upos' and 'xpos' keys) and the index of one word in it
# output - a single dictionary of features describing that word
def word2features(sentence, i):
    """Return the feature dictionary for the word at index ``i`` of ``sentence``.

    Variant with up to three words of left context and up to two words of right context.
    """
    # Sentence-level cues; every helper result is unpacked as a (contains, position) pair.
    has_q_word, q_word_pos = contains_interrogative_word(sentence)
    has_q_particle, q_particle_pos = contains_interrogative_particle(sentence)

    current = sentence[i]
    n_words = len(sentence)

    features = {}
    features['word'] = current['text']
    features['sent_len'] = n_words
    features['pos_in_sent'] = i
    features['upos'] = current['upos']
    features['xpos'] = current['xpos']
    features['first_word_in_sent'] = sentence[0]['text']
    features['contains_interrogative_word'] = has_q_word
    features['interrogative_word_position'] = q_word_pos
    features['contains_interrogative_particle'] = has_q_particle
    features['interrogative_particle_position'] = q_particle_pos

    # One word of left context, or a beginning-of-sentence marker.
    if i <= 0:
        features['BOS'] = True
    else:
        previous = sentence[i - 1]
        features['prev_word'] = previous['text']
        features['prev_word_upos'] = previous['upos']
        features['prev_word_xpos'] = previous['xpos']

    # Second word to the left, when it exists.
    if i >= 2:
        second_previous = sentence[i - 2]
        features['word_before_prev_word'] = second_previous['text']
        features['word_before_prev_word_upos'] = second_previous['upos']
        features['word_before_prev_word_xpos'] = second_previous['xpos']

    # Third word to the left, when it exists.
    if i >= 3:
        third_previous = sentence[i - 3]
        features['2_words_before_prev_word'] = third_previous['text']
        features['2_words_before_prev_word_upos'] = third_previous['upos']
        features['2_words_before_prev_word_xpos'] = third_previous['xpos']

    # One word of right context, or an end-of-sentence marker.
    if i >= n_words - 1:
        features['EOS'] = True
    else:
        following = sentence[i + 1]
        features['next_word'] = following['text']
        features['next_word_upos'] = following['upos']
        features['next_word_xpos'] = following['xpos']

    # Second word to the right, when the sentence is long enough.
    if i + 2 < n_words:
        second_following = sentence[i + 2]
        features['word_after_next_word'] = second_following['text']
        features['word_after_next_word_upos'] = second_following['upos']
        features['word_after_next_word_xpos'] = second_following['xpos']

    return features

In [40]:
%%time
# Build the training inputs/labels from Train.txt. The recorded output shows Bulgarian
# tokenize and POS processors being loaded and ~23 min 8 s of wall time.
# NOTE(review): data_prep comes from the utilities import; presumably it must pick up the
# word2features defined in the cell above for this variant to take effect — confirm.
X_train, y_train = data_prep('../../data/Train.txt')

2021-09-29 12:16:15 INFO: Loading these models for language: bg (Bulgarian):
| Processor | Package  |
------------------------
| tokenize  | standard |

2021-09-29 12:16:15 INFO: Use device: gpu
2021-09-29 12:16:15 INFO: Loading: tokenize
2021-09-29 12:16:15 INFO: Done loading processors!
2021-09-29 12:16:32 INFO: Loading these models for language: bg (Bulgarian):
| Processor | Package  |
------------------------
| tokenize  | standard |
| pos       | standard |

2021-09-29 12:16:32 INFO: Use device: gpu
2021-09-29 12:16:32 INFO: Loading: tokenize
2021-09-29 12:16:32 INFO: Loading: pos
2021-09-29 12:16:33 INFO: Done loading processors!


CPU times: user 23min 6s, sys: 2.15 s, total: 23min 9s
Wall time: 23min 8s


In [41]:
# Presumably a sanity check of the prepared training split (helper comes from the
# utilities import); no output was recorded for this cell.
verify_prepped_data(X_train, y_train)

In [42]:
%%time
# Build the dev inputs/labels from Dev.txt with the same data_prep pipeline
# (recorded wall time: ~11 min 58 s).
X_dev, y_dev = data_prep('../../data/Dev.txt')

2021-09-29 12:39:24 INFO: Loading these models for language: bg (Bulgarian):
| Processor | Package  |
------------------------
| tokenize  | standard |

2021-09-29 12:39:24 INFO: Use device: gpu
2021-09-29 12:39:24 INFO: Loading: tokenize
2021-09-29 12:39:24 INFO: Done loading processors!
2021-09-29 12:39:36 INFO: Loading these models for language: bg (Bulgarian):
| Processor | Package  |
------------------------
| tokenize  | standard |
| pos       | standard |

2021-09-29 12:39:36 INFO: Use device: gpu
2021-09-29 12:39:36 INFO: Loading: tokenize
2021-09-29 12:39:36 INFO: Loading: pos
2021-09-29 12:39:38 INFO: Done loading processors!


CPU times: user 11min 58s, sys: 952 ms, total: 11min 59s
Wall time: 11min 58s


In [43]:
# Presumably a sanity check of the prepared dev split (helper comes from the
# utilities import); no output was recorded for this cell.
verify_prepped_data(X_dev, y_dev)

In [44]:
%%time
# Fit a CRF on the training split with L-BFGS. Per the sklearn-crfsuite docs, c1 and c2
# are the L1 and L2 regularisation coefficients, and all_possible_transitions also
# generates transition features for label pairs never seen in training.
# Recorded wall time for this fit: ~32 min 8 s.
crf = sklearn_crfsuite.CRF(
    algorithm='lbfgs',
    c1=0.1,
    c2=0.1,
    max_iterations=100,
    all_possible_transitions=True
)
crf.fit(X_train, y_train)

CPU times: user 32min 9s, sys: 3.06 s, total: 32min 12s
Wall time: 32min 8s


In [None]:
# Evaluate on the dev split. First pass: weighted F1 and per-class report over every
# label the fitted CRF knows.
# NOTE(review): this cell has no execution count or recorded output in the saved notebook.
labels = list(crf.classes_)
y_pred = crf.predict(X_dev)
print(metrics.flat_f1_score(y_dev, y_pred, average='weighted', labels=labels))
print(metrics.flat_classification_report(y_dev, y_pred, labels=labels, digits=3))
# Second pass: drop the '' label (presumably "no punctuation" — confirm) and rescore.
# NOTE(review): list.remove raises ValueError if '' is not among crf.classes_.
labels.remove('')
print(metrics.flat_f1_score(y_dev, y_pred, average='weighted', labels=labels))
print(metrics.flat_classification_report(y_dev, y_pred, labels=labels, digits=3))
# Apply the predicted labels to the dev inputs and show the first 100 elements of the
# result (characters, if punctuate returns a string).
punctuated_sentences = punctuate(X_dev, y_pred)
print(punctuated_sentences[:100])

### Add more context of words after

In [46]:
# input - sentence (list of word dictionaries with 'text', 'upos' and 'xpos' keys) and the index of one word in it
# output - a single dictionary of features describing that word
def word2features(sentence, i):
    """Return the feature dictionary for the word at index ``i`` of ``sentence``.

    Variant with one word of left context and up to three words of right context.
    """
    # Sentence-level cues; every helper result is unpacked as a (contains, position) pair.
    has_q_word, q_word_pos = contains_interrogative_word(sentence)
    has_q_particle, q_particle_pos = contains_interrogative_particle(sentence)

    current = sentence[i]
    n_words = len(sentence)

    features = {}
    features['word'] = current['text']
    features['sent_len'] = n_words
    features['pos_in_sent'] = i
    features['upos'] = current['upos']
    features['xpos'] = current['xpos']
    features['first_word_in_sent'] = sentence[0]['text']
    features['contains_interrogative_word'] = has_q_word
    features['interrogative_word_position'] = q_word_pos
    features['contains_interrogative_particle'] = has_q_particle
    features['interrogative_particle_position'] = q_particle_pos

    # One word of left context, or a beginning-of-sentence marker.
    if i <= 0:
        features['BOS'] = True
    else:
        previous = sentence[i - 1]
        features['prev_word'] = previous['text']
        features['prev_word_upos'] = previous['upos']
        features['prev_word_xpos'] = previous['xpos']

    # One word of right context, or an end-of-sentence marker.
    if i >= n_words - 1:
        features['EOS'] = True
    else:
        following = sentence[i + 1]
        features['next_word'] = following['text']
        features['next_word_upos'] = following['upos']
        features['next_word_xpos'] = following['xpos']

    # Second word to the right, when the sentence is long enough.
    if i + 2 < n_words:
        second_following = sentence[i + 2]
        features['word_after_next_word'] = second_following['text']
        features['word_after_next_word_upos'] = second_following['upos']
        features['word_after_next_word_xpos'] = second_following['xpos']

    # Third word to the right, when the sentence is long enough.
    if i + 3 < n_words:
        third_following = sentence[i + 3]
        features['2_words_after_next_word'] = third_following['text']
        features['2_words_after_next_word_upos'] = third_following['upos']
        features['2_words_after_next_word_xpos'] = third_following['xpos']

    return features

In [47]:
%%time
# Build the training inputs/labels from Train.txt. The recorded output shows Bulgarian
# tokenize and POS processors being loaded and ~24 min 16 s of wall time.
# NOTE(review): data_prep comes from the utilities import; presumably it must pick up the
# word2features defined in the cell above for this variant to take effect — confirm.
X_train, y_train = data_prep('../../data/Train.txt')

2021-09-29 13:23:51 INFO: Loading these models for language: bg (Bulgarian):
| Processor | Package  |
------------------------
| tokenize  | standard |

2021-09-29 13:23:51 INFO: Use device: gpu
2021-09-29 13:23:51 INFO: Loading: tokenize
2021-09-29 13:23:51 INFO: Done loading processors!
2021-09-29 13:24:17 INFO: Loading these models for language: bg (Bulgarian):
| Processor | Package  |
------------------------
| tokenize  | standard |
| pos       | standard |

2021-09-29 13:24:17 INFO: Use device: gpu
2021-09-29 13:24:17 INFO: Loading: tokenize
2021-09-29 13:24:17 INFO: Loading: pos
2021-09-29 13:24:18 INFO: Done loading processors!


CPU times: user 24min 15s, sys: 2.66 s, total: 24min 17s
Wall time: 24min 16s


In [48]:
# Presumably a sanity check of the prepared training split (helper comes from the
# utilities import); no output was recorded for this cell.
verify_prepped_data(X_train, y_train)

In [49]:
%%time
# Build the dev inputs/labels from Dev.txt with the same data_prep pipeline
# (recorded wall time: ~12 min).
X_dev, y_dev = data_prep('../../data/Dev.txt')

2021-09-29 13:48:07 INFO: Loading these models for language: bg (Bulgarian):
| Processor | Package  |
------------------------
| tokenize  | standard |

2021-09-29 13:48:07 INFO: Use device: gpu
2021-09-29 13:48:07 INFO: Loading: tokenize
2021-09-29 13:48:07 INFO: Done loading processors!
2021-09-29 13:48:20 INFO: Loading these models for language: bg (Bulgarian):
| Processor | Package  |
------------------------
| tokenize  | standard |
| pos       | standard |

2021-09-29 13:48:20 INFO: Use device: gpu
2021-09-29 13:48:20 INFO: Loading: tokenize
2021-09-29 13:48:20 INFO: Loading: pos
2021-09-29 13:48:22 INFO: Done loading processors!


CPU times: user 12min, sys: 896 ms, total: 12min 1s
Wall time: 12min


In [50]:
# Presumably a sanity check of the prepared dev split (helper comes from the
# utilities import); no output was recorded for this cell.
verify_prepped_data(X_dev, y_dev)

In [51]:
%%time
# Fit a CRF on the training split with L-BFGS. Per the sklearn-crfsuite docs, c1 and c2
# are the L1 and L2 regularisation coefficients, and all_possible_transitions also
# generates transition features for label pairs never seen in training.
# Recorded wall time for this fit: ~29 min 3 s.
crf = sklearn_crfsuite.CRF(
    algorithm='lbfgs',
    c1=0.1,
    c2=0.1,
    max_iterations=100,
    all_possible_transitions=True
)
crf.fit(X_train, y_train)

CPU times: user 29min 5s, sys: 1.5 s, total: 29min 7s
Wall time: 29min 3s


In [None]:
# Evaluate on the dev split. First pass: weighted F1 and per-class report over every
# label the fitted CRF knows.
# NOTE(review): this cell has no execution count or recorded output in the saved notebook.
labels = list(crf.classes_)
y_pred = crf.predict(X_dev)
print(metrics.flat_f1_score(y_dev, y_pred, average='weighted', labels=labels))
print(metrics.flat_classification_report(y_dev, y_pred, labels=labels, digits=3))
# Second pass: drop the '' label (presumably "no punctuation" — confirm) and rescore.
# NOTE(review): list.remove raises ValueError if '' is not among crf.classes_.
labels.remove('')
print(metrics.flat_f1_score(y_dev, y_pred, average='weighted', labels=labels))
print(metrics.flat_classification_report(y_dev, y_pred, labels=labels, digits=3))
# Apply the predicted labels to the dev inputs and show the first 100 elements of the
# result (characters, if punctuate returns a string).
punctuated_sentences = punctuate(X_dev, y_pred)
print(punctuated_sentences[:100])

### Check if word starts with capital letter

In [53]:
# input - sentence (list of word dictionaries with 'text', 'upos' and 'xpos' keys) and the index of one word in it
# output - a single dictionary of features describing that word
def word2features(sentence, i):
    """Return the feature dictionary for the word at index ``i`` of ``sentence``.

    Variant that adds "starts with a capital letter" flags for the word itself,
    the previous word and the next two words.
    """
    def starts_with_capital(text):
        # Slice instead of indexing so an empty token text gives False, not IndexError.
        return text[:1].isupper()

    # Sentence-level cues; every helper result is unpacked as a (contains, position) pair.
    has_q_word, q_word_pos = contains_interrogative_word(sentence)
    has_q_particle, q_particle_pos = contains_interrogative_particle(sentence)

    current = sentence[i]
    n_words = len(sentence)

    features = {
        'word': current['text'],
        'sent_len': n_words,
        'pos_in_sent': i,
        'upos': current['upos'],
        'xpos': current['xpos'],
        'first_word_in_sent': sentence[0]['text'],
        'contains_interrogative_word': has_q_word,
        'interrogative_word_position': q_word_pos,
        'contains_interrogative_particle': has_q_particle,
        'interrogative_particle_position': q_particle_pos,
        'starts_with_capital_letter': starts_with_capital(current['text'])
    }

    # One word of left context, or a beginning-of-sentence marker.
    if i > 0:
        previous = sentence[i - 1]
        features.update({
            'prev_word': previous['text'],
            'prev_word_upos': previous['upos'],
            'prev_word_xpos': previous['xpos'],
            'prev_word_starts_with_capital_letter': starts_with_capital(previous['text'])
        })
    else:
        features.update({
            'BOS': True
        })

    # One word of right context, or an end-of-sentence marker.
    if i < n_words - 1:
        following = sentence[i + 1]
        features.update({
            'next_word': following['text'],
            'next_word_upos': following['upos'],
            'next_word_xpos': following['xpos'],
            'next_word_starts_with_capital_letter': starts_with_capital(following['text'])
        })
    else:
        features.update({
            'EOS': True
        })

    # Second word to the right, when the sentence is long enough.
    if i < n_words - 2:
        second_following = sentence[i + 2]
        features.update({
            'word_after_next_word': second_following['text'],
            'word_after_next_word_upos': second_following['upos'],
            'word_after_next_word_xpos': second_following['xpos'],
            'word_after_next_word_starts_with_capital_letter': starts_with_capital(second_following['text'])
        })

    return features

In [54]:
%%time
# Build the training inputs/labels from Train.txt. The recorded output shows Bulgarian
# tokenize and POS processors being loaded and ~23 min 41 s of wall time.
# NOTE(review): data_prep comes from the utilities import; presumably it must pick up the
# word2features defined in the cell above for this variant to take effect — confirm.
X_train, y_train = data_prep('../../data/Train.txt')

2021-09-29 14:54:36 INFO: Loading these models for language: bg (Bulgarian):
| Processor | Package  |
------------------------
| tokenize  | standard |

2021-09-29 14:54:36 INFO: Use device: gpu
2021-09-29 14:54:36 INFO: Loading: tokenize
2021-09-29 14:54:36 INFO: Done loading processors!
2021-09-29 14:54:52 INFO: Loading these models for language: bg (Bulgarian):
| Processor | Package  |
------------------------
| tokenize  | standard |
| pos       | standard |

2021-09-29 14:54:52 INFO: Use device: gpu
2021-09-29 14:54:52 INFO: Loading: tokenize
2021-09-29 14:54:52 INFO: Loading: pos
2021-09-29 14:54:53 INFO: Done loading processors!


CPU times: user 23min 40s, sys: 2.24 s, total: 23min 42s
Wall time: 23min 41s


In [55]:
# Presumably a sanity check of the prepared training split (helper comes from the
# utilities import); no output was recorded for this cell.
verify_prepped_data(X_train, y_train)

In [56]:
%%time
# Build the dev inputs/labels from Dev.txt with the same data_prep pipeline
# (recorded wall time: ~11 min 55 s).
X_dev, y_dev = data_prep('../../data/Dev.txt')

2021-09-29 15:18:18 INFO: Loading these models for language: bg (Bulgarian):
| Processor | Package  |
------------------------
| tokenize  | standard |

2021-09-29 15:18:18 INFO: Use device: gpu
2021-09-29 15:18:18 INFO: Loading: tokenize
2021-09-29 15:18:18 INFO: Done loading processors!
2021-09-29 15:18:30 INFO: Loading these models for language: bg (Bulgarian):
| Processor | Package  |
------------------------
| tokenize  | standard |
| pos       | standard |

2021-09-29 15:18:30 INFO: Use device: gpu
2021-09-29 15:18:30 INFO: Loading: tokenize
2021-09-29 15:18:30 INFO: Loading: pos
2021-09-29 15:18:32 INFO: Done loading processors!


CPU times: user 11min 55s, sys: 996 ms, total: 11min 56s
Wall time: 11min 55s


In [57]:
# Presumably a sanity check of the prepared dev split (helper comes from the
# utilities import); no output was recorded for this cell.
verify_prepped_data(X_dev, y_dev)

In [58]:
%%time
# Fit a CRF on the training split with L-BFGS. Per the sklearn-crfsuite docs, c1 and c2
# are the L1 and L2 regularisation coefficients, and all_possible_transitions also
# generates transition features for label pairs never seen in training.
# Recorded wall time for this fit: ~30 min 13 s.
crf = sklearn_crfsuite.CRF(
    algorithm='lbfgs',
    c1=0.1,
    c2=0.1,
    max_iterations=100,
    all_possible_transitions=True
)
crf.fit(X_train, y_train)

CPU times: user 30min 15s, sys: 1.28 s, total: 30min 16s
Wall time: 30min 13s


In [None]:
# Evaluate on the dev split. First pass: weighted F1 and per-class report over every
# label the fitted CRF knows.
# NOTE(review): this cell has no execution count or recorded output in the saved notebook.
labels = list(crf.classes_)
y_pred = crf.predict(X_dev)
print(metrics.flat_f1_score(y_dev, y_pred, average='weighted', labels=labels))
print(metrics.flat_classification_report(y_dev, y_pred, labels=labels, digits=3))
# Second pass: drop the '' label (presumably "no punctuation" — confirm) and rescore.
# NOTE(review): list.remove raises ValueError if '' is not among crf.classes_.
labels.remove('')
print(metrics.flat_f1_score(y_dev, y_pred, average='weighted', labels=labels))
print(metrics.flat_classification_report(y_dev, y_pred, labels=labels, digits=3))
# Apply the predicted labels to the dev inputs and show the first 100 elements of the
# result (characters, if punctuate returns a string).
punctuated_sentences = punctuate(X_dev, y_pred)
print(punctuated_sentences[:100])

### Contains relative pronoun before

In [77]:
# input - sentence (list of word dictionaries with 'text', 'upos' and 'xpos' keys) and the index of one word in it
# output - a single dictionary of features describing that word
def word2features(sentence, i):
    """Return the feature dictionary for the word at index ``i`` of ``sentence``.

    Variant that adds the relative-pronoun-before feature, whose helper receives
    the word index as well as the sentence.
    """
    # Helper results are unpacked as (contains, position) pairs; the last one depends on i.
    has_q_word, q_word_pos = contains_interrogative_word(sentence)
    has_q_particle, q_particle_pos = contains_interrogative_particle(sentence)
    has_rel_pronoun_before, rel_pronoun_pos = contains_relative_pronoun_before(sentence, i)

    current = sentence[i]
    n_words = len(sentence)

    features = {}
    features['word'] = current['text']
    features['sent_len'] = n_words
    features['pos_in_sent'] = i
    features['upos'] = current['upos']
    features['xpos'] = current['xpos']
    features['first_word_in_sent'] = sentence[0]['text']
    features['contains_interrogative_word'] = has_q_word
    features['interrogative_word_position'] = q_word_pos
    features['contains_interrogative_particle'] = has_q_particle
    features['interrogative_particle_position'] = q_particle_pos
    features['contains_relative_pronoun_before'] = has_rel_pronoun_before
    features['relative_pronoun_position'] = rel_pronoun_pos

    # One word of left context, or a beginning-of-sentence marker.
    if i <= 0:
        features['BOS'] = True
    else:
        previous = sentence[i - 1]
        features['prev_word'] = previous['text']
        features['prev_word_upos'] = previous['upos']
        features['prev_word_xpos'] = previous['xpos']

    # One word of right context, or an end-of-sentence marker.
    if i >= n_words - 1:
        features['EOS'] = True
    else:
        following = sentence[i + 1]
        features['next_word'] = following['text']
        features['next_word_upos'] = following['upos']
        features['next_word_xpos'] = following['xpos']

    # Second word to the right, when the sentence is long enough.
    if i + 2 < n_words:
        second_following = sentence[i + 2]
        features['word_after_next_word'] = second_following['text']
        features['word_after_next_word_upos'] = second_following['upos']
        features['word_after_next_word_xpos'] = second_following['xpos']

    return features

In [78]:
%%time
X_train, y_train = data_prep('../../data/Train.txt')

2021-09-29 20:23:44 INFO: Loading these models for language: bg (Bulgarian):
| Processor | Package  |
------------------------
| tokenize  | standard |

2021-09-29 20:23:44 INFO: Use device: gpu
2021-09-29 20:23:44 INFO: Loading: tokenize
2021-09-29 20:23:44 INFO: Done loading processors!
2021-09-29 20:24:01 INFO: Loading these models for language: bg (Bulgarian):
| Processor | Package  |
------------------------
| tokenize  | standard |
| pos       | standard |

2021-09-29 20:24:01 INFO: Use device: gpu
2021-09-29 20:24:01 INFO: Loading: tokenize
2021-09-29 20:24:01 INFO: Loading: pos
2021-09-29 20:24:01 INFO: Done loading processors!


CPU times: user 24min 45s, sys: 3.38 s, total: 24min 48s
Wall time: 24min 47s


In [79]:
verify_prepped_data(X_train, y_train)

In [80]:
%%time
X_dev, y_dev = data_prep('../../data/Dev.txt')

2021-09-29 20:48:32 INFO: Loading these models for language: bg (Bulgarian):
| Processor | Package  |
------------------------
| tokenize  | standard |

2021-09-29 20:48:32 INFO: Use device: gpu
2021-09-29 20:48:32 INFO: Loading: tokenize
2021-09-29 20:48:32 INFO: Done loading processors!
2021-09-29 20:48:45 INFO: Loading these models for language: bg (Bulgarian):
| Processor | Package  |
------------------------
| tokenize  | standard |
| pos       | standard |

2021-09-29 20:48:45 INFO: Use device: gpu
2021-09-29 20:48:45 INFO: Loading: tokenize
2021-09-29 20:48:45 INFO: Loading: pos
2021-09-29 20:48:47 INFO: Done loading processors!


CPU times: user 11min 57s, sys: 1.15 s, total: 11min 58s
Wall time: 11min 57s


In [81]:
verify_prepped_data(X_dev, y_dev)

In [82]:
%%time
# Train a fresh CRF on the current X_train / y_train.
# Hyperparameters (per the sklearn-crfsuite CRF documentation):
#   algorithm='lbfgs'             - gradient descent using the L-BFGS method
#   c1 / c2                       - L1 / L2 regularization coefficients
#   max_iterations                - cap on the number of optimizer iterations
#   all_possible_transitions=True - also generate transition features for
#                                   label pairs that never occur in training
crf = sklearn_crfsuite.CRF(
    algorithm='lbfgs',
    c1=0.1,
    c2=0.1,
    max_iterations=100,
    all_possible_transitions=True
)
crf.fit(X_train, y_train)

CPU times: user 38min 32s, sys: 2.04 s, total: 38min 34s
Wall time: 38min 28s


In [None]:
# Evaluate the fitted CRF on the dev set: weighted F1 and a per-label report
# over every label the model learned.
labels = list(crf.classes_)
y_pred = crf.predict(X_dev)
print(metrics.flat_f1_score(y_dev, y_pred, average='weighted', labels=labels))
print(metrics.flat_classification_report(y_dev, y_pred, labels=labels, digits=3))
# Repeat the evaluation without the empty-string label (presumably the
# "no punctuation" class - confirm against data_prep). Filtering, unlike
# labels.remove(''), cannot raise ValueError when '' was never learned.
labels = [label for label in labels if label != '']
print(metrics.flat_f1_score(y_dev, y_pred, average='weighted', labels=labels))
print(metrics.flat_classification_report(y_dev, y_pred, labels=labels, digits=3))
# Show the first 100 elements of punctuate()'s output for a manual check.
punctuated_sentences = punctuate(X_dev, y_pred)
print(punctuated_sentences[:100])

### Contains conjunction before

In [84]:
def word2features(sentence, i):
    """Assemble the CRF feature dictionary for one word of a sentence.

    input  - sentence: list of word dictionaries (each is read through its
             'text', 'upos' and 'xpos' keys); i: index of the word to describe
    output - a single dictionary of features for the word at index i
    """
    has_q_word, q_word_pos = contains_interrogative_word(sentence)
    has_q_particle, q_particle_pos = contains_interrogative_particle(sentence)
    has_conj_before, conj_pos = contains_conj_before(sentence, i)

    current = sentence[i]
    n_words = len(sentence)

    # Features of the word itself plus the sentence-level cues.
    features = {
        'word': current['text'],
        'sent_len': n_words,
        'pos_in_sent': i,
        'upos': current['upos'],
        'xpos': current['xpos'],
        'first_word_in_sent': sentence[0]['text'],
        'contains_interrogative_word': has_q_word,
        'interrogative_word_position': q_word_pos,
        'contains_interrogative_particle': has_q_particle,
        'interrogative_particle_position': q_particle_pos,
        'contains_conj_before': has_conj_before,
        'conj_position': conj_pos
    }

    # Left neighbour, or a beginning-of-sentence marker when there is none.
    if i <= 0:
        features['BOS'] = True
    else:
        left = sentence[i - 1]
        features['prev_word'] = left['text']
        features['prev_word_upos'] = left['upos']
        features['prev_word_xpos'] = left['xpos']

    # Right neighbour, or an end-of-sentence marker when there is none.
    if i + 1 < n_words:
        right = sentence[i + 1]
        features['next_word'] = right['text']
        features['next_word_upos'] = right['upos']
        features['next_word_xpos'] = right['xpos']
    else:
        features['EOS'] = True

    # Second word to the right, when the sentence is long enough.
    if i + 2 < n_words:
        second_right = sentence[i + 2]
        features['word_after_next_word'] = second_right['text']
        features['word_after_next_word_upos'] = second_right['upos']
        features['word_after_next_word_xpos'] = second_right['xpos']

    return features

In [85]:
%%time
X_train, y_train = data_prep('../../data/Train.txt')

2021-09-29 21:39:18 INFO: Loading these models for language: bg (Bulgarian):
| Processor | Package  |
------------------------
| tokenize  | standard |

2021-09-29 21:39:18 INFO: Use device: gpu
2021-09-29 21:39:18 INFO: Loading: tokenize
2021-09-29 21:39:18 INFO: Done loading processors!
2021-09-29 21:39:44 INFO: Loading these models for language: bg (Bulgarian):
| Processor | Package  |
------------------------
| tokenize  | standard |
| pos       | standard |

2021-09-29 21:39:44 INFO: Use device: gpu
2021-09-29 21:39:44 INFO: Loading: tokenize
2021-09-29 21:39:44 INFO: Loading: pos
2021-09-29 21:39:46 INFO: Done loading processors!


CPU times: user 24min 54s, sys: 2.29 s, total: 24min 56s
Wall time: 24min 56s


In [86]:
verify_prepped_data(X_train, y_train)

In [87]:
%%time
X_dev, y_dev = data_prep('../../data/Dev.txt')

2021-09-29 22:04:14 INFO: Loading these models for language: bg (Bulgarian):
| Processor | Package  |
------------------------
| tokenize  | standard |

2021-09-29 22:04:14 INFO: Use device: gpu
2021-09-29 22:04:14 INFO: Loading: tokenize
2021-09-29 22:04:14 INFO: Done loading processors!
2021-09-29 22:04:26 INFO: Loading these models for language: bg (Bulgarian):
| Processor | Package  |
------------------------
| tokenize  | standard |
| pos       | standard |

2021-09-29 22:04:26 INFO: Use device: gpu
2021-09-29 22:04:26 INFO: Loading: tokenize
2021-09-29 22:04:26 INFO: Loading: pos
2021-09-29 22:04:28 INFO: Done loading processors!


CPU times: user 12min 34s, sys: 1.11 s, total: 12min 35s
Wall time: 12min 34s


In [88]:
verify_prepped_data(X_dev, y_dev)

In [89]:
%%time
# Train a fresh CRF on the current X_train / y_train.
# Hyperparameters (per the sklearn-crfsuite CRF documentation):
#   algorithm='lbfgs'             - gradient descent using the L-BFGS method
#   c1 / c2                       - L1 / L2 regularization coefficients
#   max_iterations                - cap on the number of optimizer iterations
#   all_possible_transitions=True - also generate transition features for
#                                   label pairs that never occur in training
crf = sklearn_crfsuite.CRF(
    algorithm='lbfgs',
    c1=0.1,
    c2=0.1,
    max_iterations=100,
    all_possible_transitions=True
)
crf.fit(X_train, y_train)

CPU times: user 41min 52s, sys: 3.25 s, total: 41min 55s
Wall time: 41min 49s


In [None]:
# Evaluate the fitted CRF on the dev set: weighted F1 and a per-label report
# over every label the model learned.
labels = list(crf.classes_)
y_pred = crf.predict(X_dev)
print(metrics.flat_f1_score(y_dev, y_pred, average='weighted', labels=labels))
print(metrics.flat_classification_report(y_dev, y_pred, labels=labels, digits=3))
# Repeat the evaluation without the empty-string label (presumably the
# "no punctuation" class - confirm against data_prep). Filtering, unlike
# labels.remove(''), cannot raise ValueError when '' was never learned.
labels = [label for label in labels if label != '']
print(metrics.flat_f1_score(y_dev, y_pred, average='weighted', labels=labels))
print(metrics.flat_classification_report(y_dev, y_pred, labels=labels, digits=3))
# Show the first 100 elements of punctuate()'s output for a manual check.
punctuated_sentences = punctuate(X_dev, y_pred)
print(punctuated_sentences[:100])

### Model combining the best features

In [91]:
def word2features(sentence, i):
    """Assemble the CRF feature dictionary for one word of a sentence.

    Combines the word's own text and tags, sentence-level cues (interrogative
    word / particle, imperative verb), capitalisation flags, and the same
    information for up to two neighbours on each side.

    input  - sentence: list of word dictionaries (each is read through its
             'text', 'upos' and 'xpos' keys); i: index of the word to describe
    output - a single dictionary of features for the word at index i
    """
    sentence_contains_interrogative_word, interrogative_word_position = contains_interrogative_word(sentence)
    sentence_contains_interrogative_particle, interrogative_particle_position = contains_interrogative_particle(sentence)
    sentence_contains_imperative_verb, imperative_verb_position = contains_imperative_verb(sentence)

    def starts_with_capital(token):
        # Slice rather than index: an empty token text yields False instead of
        # raising IndexError.
        return token['text'][:1].isupper()

    current = sentence[i]
    n_words = len(sentence)

    features = {
        'word': current['text'],
        'sent_len': n_words,
        'pos_in_sent': i,
        'upos': current['upos'],
        'xpos': current['xpos'],
        'first_word_in_sent': sentence[0]['text'],
        'contains_interrogative_word': sentence_contains_interrogative_word,
        'interrogative_word_position': interrogative_word_position,
        'contains_interrogative_particle': sentence_contains_interrogative_particle,
        'interrogative_particle_position': interrogative_particle_position,
        'contains_imperative_verb': sentence_contains_imperative_verb,
        'imperative_verb_position': imperative_verb_position,
        'starts_with_capital_letter': starts_with_capital(current)
    }

    # Previous word, or a beginning-of-sentence marker when there is none.
    if i > 0:
        prev_word = sentence[i - 1]
        features.update({
            'prev_word': prev_word['text'],
            'prev_word_upos': prev_word['upos'],
            'prev_word_xpos': prev_word['xpos'],
            'prev_word_starts_with_capital_letter': starts_with_capital(prev_word)
        })
    else:
        features.update({
            'BOS': True
        })

    # Word two positions to the left, when it exists.
    if i > 1:
        word_before_prev = sentence[i - 2]
        features.update({
            'word_before_prev_word': word_before_prev['text'],
            'word_before_prev_word_upos': word_before_prev['upos'],
            'word_before_prev_word_xpos': word_before_prev['xpos'],
            'word_before_prev_word_starts_with_capital_letter': starts_with_capital(word_before_prev)
        })

    # Next word, or an end-of-sentence marker when there is none.
    if i < n_words - 1:
        next_word = sentence[i + 1]
        features.update({
            'next_word': next_word['text'],
            'next_word_upos': next_word['upos'],
            'next_word_xpos': next_word['xpos'],
            'next_word_starts_with_capital_letter': starts_with_capital(next_word)
        })
    else:
        features.update({
            'EOS': True
        })

    # Word two positions to the right, when it exists.
    if i < n_words - 2:
        word_after_next = sentence[i + 2]
        features.update({
            'word_after_next_word': word_after_next['text'],
            'word_after_next_word_upos': word_after_next['upos'],
            'word_after_next_word_xpos': word_after_next['xpos'],
            'word_after_next_word_starts_with_capital_letter': starts_with_capital(word_after_next)
        })

    return features

In [92]:
%%time
X_train, y_train = data_prep('../../data/Train.txt')

2021-09-29 23:07:29 INFO: Loading these models for language: bg (Bulgarian):
| Processor | Package  |
------------------------
| tokenize  | standard |

2021-09-29 23:07:29 INFO: Use device: gpu
2021-09-29 23:07:29 INFO: Loading: tokenize
2021-09-29 23:07:29 INFO: Done loading processors!
2021-09-29 23:07:52 INFO: Loading these models for language: bg (Bulgarian):
| Processor | Package  |
------------------------
| tokenize  | standard |
| pos       | standard |

2021-09-29 23:07:52 INFO: Use device: gpu
2021-09-29 23:07:52 INFO: Loading: tokenize
2021-09-29 23:07:52 INFO: Loading: pos
2021-09-29 23:07:54 INFO: Done loading processors!


CPU times: user 24min 40s, sys: 2.26 s, total: 24min 43s
Wall time: 24min 42s


In [93]:
verify_prepped_data(X_train, y_train)

In [94]:
%%time
X_dev, y_dev = data_prep('../../data/Dev.txt')

2021-09-29 23:32:11 INFO: Loading these models for language: bg (Bulgarian):
| Processor | Package  |
------------------------
| tokenize  | standard |

2021-09-29 23:32:11 INFO: Use device: gpu
2021-09-29 23:32:11 INFO: Loading: tokenize
2021-09-29 23:32:11 INFO: Done loading processors!
2021-09-29 23:32:24 INFO: Loading these models for language: bg (Bulgarian):
| Processor | Package  |
------------------------
| tokenize  | standard |
| pos       | standard |

2021-09-29 23:32:24 INFO: Use device: gpu
2021-09-29 23:32:24 INFO: Loading: tokenize
2021-09-29 23:32:24 INFO: Loading: pos
2021-09-29 23:32:26 INFO: Done loading processors!


CPU times: user 12min 27s, sys: 1.1 s, total: 12min 28s
Wall time: 12min 27s


In [95]:
%%time
# Train a fresh CRF on the current X_train / y_train.
# Hyperparameters (per the sklearn-crfsuite CRF documentation):
#   algorithm='lbfgs'             - gradient descent using the L-BFGS method
#   c1 / c2                       - L1 / L2 regularization coefficients
#   max_iterations                - cap on the number of optimizer iterations
#   all_possible_transitions=True - also generate transition features for
#                                   label pairs that never occur in training
crf = sklearn_crfsuite.CRF(
    algorithm='lbfgs',
    c1=0.1,
    c2=0.1,
    max_iterations=100,
    all_possible_transitions=True
)
crf.fit(X_train, y_train)

CPU times: user 45min 25s, sys: 2.87 s, total: 45min 28s
Wall time: 45min 22s


In [None]:
# Evaluate the fitted CRF on the dev set: weighted F1 and a per-label report
# over every label the model learned.
labels = list(crf.classes_)
y_pred = crf.predict(X_dev)
print(metrics.flat_f1_score(y_dev, y_pred, average='weighted', labels=labels))
print(metrics.flat_classification_report(y_dev, y_pred, labels=labels, digits=3))
# Repeat the evaluation without the empty-string label (presumably the
# "no punctuation" class - confirm against data_prep). Filtering, unlike
# labels.remove(''), cannot raise ValueError when '' was never learned.
labels = [label for label in labels if label != '']
print(metrics.flat_f1_score(y_dev, y_pred, average='weighted', labels=labels))
print(metrics.flat_classification_report(y_dev, y_pred, labels=labels, digits=3))
# Show the first 100 elements of punctuate()'s output for a manual check.
punctuated_sentences = punctuate(X_dev, y_pred)
print(punctuated_sentences[:100])

In [97]:
save_as_json(X_dev, '../../data/X_dev.json')

In [99]:
save_as_json(y_dev, '../../data/y_dev.json')

### Expand train set

In [21]:
def word2features(sentence, i):
    """Assemble the CRF feature dictionary for one word of a sentence.

    Combines the word's own text and tags, sentence-level cues (interrogative
    word / particle, imperative verb), capitalisation flags, and the same
    information for up to two neighbours on each side.

    input  - sentence: list of word dictionaries (each is read through its
             'text', 'upos' and 'xpos' keys); i: index of the word to describe
    output - a single dictionary of features for the word at index i
    """
    sentence_contains_interrogative_word, interrogative_word_position = contains_interrogative_word(sentence)
    sentence_contains_interrogative_particle, interrogative_particle_position = contains_interrogative_particle(sentence)
    sentence_contains_imperative_verb, imperative_verb_position = contains_imperative_verb(sentence)

    def starts_with_capital(token):
        # Slice rather than index: an empty token text yields False instead of
        # raising IndexError.
        return token['text'][:1].isupper()

    current = sentence[i]
    n_words = len(sentence)

    features = {
        'word': current['text'],
        'sent_len': n_words,
        'pos_in_sent': i,
        'upos': current['upos'],
        'xpos': current['xpos'],
        'first_word_in_sent': sentence[0]['text'],
        'contains_interrogative_word': sentence_contains_interrogative_word,
        'interrogative_word_position': interrogative_word_position,
        'contains_interrogative_particle': sentence_contains_interrogative_particle,
        'interrogative_particle_position': interrogative_particle_position,
        'contains_imperative_verb': sentence_contains_imperative_verb,
        'imperative_verb_position': imperative_verb_position,
        'starts_with_capital_letter': starts_with_capital(current)
    }

    # Previous word, or a beginning-of-sentence marker when there is none.
    if i > 0:
        prev_word = sentence[i - 1]
        features.update({
            'prev_word': prev_word['text'],
            'prev_word_upos': prev_word['upos'],
            'prev_word_xpos': prev_word['xpos'],
            'prev_word_starts_with_capital_letter': starts_with_capital(prev_word)
        })
    else:
        features.update({
            'BOS': True
        })

    # Word two positions to the left, when it exists.
    if i > 1:
        word_before_prev = sentence[i - 2]
        features.update({
            'word_before_prev_word': word_before_prev['text'],
            'word_before_prev_word_upos': word_before_prev['upos'],
            'word_before_prev_word_xpos': word_before_prev['xpos'],
            'word_before_prev_word_starts_with_capital_letter': starts_with_capital(word_before_prev)
        })

    # Next word, or an end-of-sentence marker when there is none.
    if i < n_words - 1:
        next_word = sentence[i + 1]
        features.update({
            'next_word': next_word['text'],
            'next_word_upos': next_word['upos'],
            'next_word_xpos': next_word['xpos'],
            'next_word_starts_with_capital_letter': starts_with_capital(next_word)
        })
    else:
        features.update({
            'EOS': True
        })

    # Word two positions to the right, when it exists.
    if i < n_words - 2:
        word_after_next = sentence[i + 2]
        features.update({
            'word_after_next_word': word_after_next['text'],
            'word_after_next_word_upos': word_after_next['upos'],
            'word_after_next_word_xpos': word_after_next['xpos'],
            'word_after_next_word_starts_with_capital_letter': starts_with_capital(word_after_next)
        })

    return features

In [22]:
X_dev = load_json('../../data/X_dev.json')

In [23]:
y_dev = load_json('../../data/y_dev.json')

In [24]:
verify_prepped_data(X_dev, y_dev)

### Expand train set to 30% of data

In [26]:
%%time
X_train, y_train = data_prep('../../data/Train_30.txt')

2021-09-30 01:15:42 INFO: Loading these models for language: bg (Bulgarian):
| Processor | Package  |
------------------------
| tokenize  | standard |

2021-09-30 01:15:42 INFO: Use device: gpu
2021-09-30 01:15:42 INFO: Loading: tokenize
2021-09-30 01:15:42 INFO: Done loading processors!
2021-09-30 01:16:13 INFO: Loading these models for language: bg (Bulgarian):
| Processor | Package  |
------------------------
| tokenize  | standard |
| pos       | standard |

2021-09-30 01:16:13 INFO: Use device: gpu
2021-09-30 01:16:13 INFO: Loading: tokenize
2021-09-30 01:16:13 INFO: Loading: pos
2021-09-30 01:16:18 INFO: Done loading processors!


CPU times: user 33min 54s, sys: 4.56 s, total: 33min 58s
Wall time: 33min 58s


In [27]:
verify_prepped_data(X_train, y_train)

In [28]:
%%time
# Train a fresh CRF on the current X_train / y_train.
# Hyperparameters (per the sklearn-crfsuite CRF documentation):
#   algorithm='lbfgs'             - gradient descent using the L-BFGS method
#   c1 / c2                       - L1 / L2 regularization coefficients
#   max_iterations                - cap on the number of optimizer iterations
#   all_possible_transitions=True - also generate transition features for
#                                   label pairs that never occur in training
crf = sklearn_crfsuite.CRF(
    algorithm='lbfgs',
    c1=0.1,
    c2=0.1,
    max_iterations=100,
    all_possible_transitions=True
)
crf.fit(X_train, y_train)

CPU times: user 58min 38s, sys: 2.54 s, total: 58min 41s
Wall time: 58min 34s


In [None]:
# Evaluate the fitted CRF on the dev set: weighted F1 and a per-label report
# over every label the model learned.
labels = list(crf.classes_)
y_pred = crf.predict(X_dev)
print(metrics.flat_f1_score(y_dev, y_pred, average='weighted', labels=labels))
print(metrics.flat_classification_report(y_dev, y_pred, labels=labels, digits=3))
# Repeat the evaluation without the empty-string label (presumably the
# "no punctuation" class - confirm against data_prep). Filtering, unlike
# labels.remove(''), cannot raise ValueError when '' was never learned.
labels = [label for label in labels if label != '']
print(metrics.flat_f1_score(y_dev, y_pred, average='weighted', labels=labels))
print(metrics.flat_classification_report(y_dev, y_pred, labels=labels, digits=3))
# Show the first 100 elements of punctuate()'s output for a manual check.
punctuated_sentences = punctuate(X_dev, y_pred)
print(punctuated_sentences[:100])

### Expand train set to 50% of data

In [30]:
%%time
X_train, y_train = data_prep('../../data/Train_50.txt')

2021-09-30 02:48:30 INFO: Loading these models for language: bg (Bulgarian):
| Processor | Package  |
------------------------
| tokenize  | standard |

2021-09-30 02:48:30 INFO: Use device: gpu
2021-09-30 02:48:30 INFO: Loading: tokenize
2021-09-30 02:48:30 INFO: Done loading processors!
2021-09-30 02:49:11 INFO: Loading these models for language: bg (Bulgarian):
| Processor | Package  |
------------------------
| tokenize  | standard |
| pos       | standard |

2021-09-30 02:49:11 INFO: Use device: gpu
2021-09-30 02:49:11 INFO: Loading: tokenize
2021-09-30 02:49:11 INFO: Loading: pos
2021-09-30 02:49:12 INFO: Done loading processors!


CPU times: user 57min 28s, sys: 6.39 s, total: 57min 34s
Wall time: 57min 32s


In [31]:
verify_prepped_data(X_train, y_train)

In [32]:
%%time
# Train a fresh CRF on the current X_train / y_train.
# Hyperparameters (per the sklearn-crfsuite CRF documentation):
#   algorithm='lbfgs'             - gradient descent using the L-BFGS method
#   c1 / c2                       - L1 / L2 regularization coefficients
#   max_iterations                - cap on the number of optimizer iterations
#   all_possible_transitions=True - also generate transition features for
#                                   label pairs that never occur in training
crf = sklearn_crfsuite.CRF(
    algorithm='lbfgs',
    c1=0.1,
    c2=0.1,
    max_iterations=100,
    all_possible_transitions=True
)
crf.fit(X_train, y_train)

CPU times: user 1h 55min 54s, sys: 3.86 s, total: 1h 55min 58s
Wall time: 1h 55min 45s


In [None]:
# Evaluate the fitted CRF on the dev set: weighted F1 and a per-label report
# over every label the model learned.
labels = list(crf.classes_)
y_pred = crf.predict(X_dev)
print(metrics.flat_f1_score(y_dev, y_pred, average='weighted', labels=labels))
print(metrics.flat_classification_report(y_dev, y_pred, labels=labels, digits=3))
# Repeat the evaluation without the empty-string label (presumably the
# "no punctuation" class - confirm against data_prep). Filtering, unlike
# labels.remove(''), cannot raise ValueError when '' was never learned.
labels = [label for label in labels if label != '']
print(metrics.flat_f1_score(y_dev, y_pred, average='weighted', labels=labels))
print(metrics.flat_classification_report(y_dev, y_pred, labels=labels, digits=3))
# Show the first 100 elements of punctuate()'s output for a manual check.
punctuated_sentences = punctuate(X_dev, y_pred)
print(punctuated_sentences[:100])

### Expand train set to 60% of data

In [34]:
%%time
X_train, y_train = data_prep('../../data/Train_60.txt')

2021-09-30 05:42:06 INFO: Loading these models for language: bg (Bulgarian):
| Processor | Package  |
------------------------
| tokenize  | standard |

2021-09-30 05:42:06 INFO: Use device: gpu
2021-09-30 05:42:06 INFO: Loading: tokenize
2021-09-30 05:42:06 INFO: Done loading processors!
2021-09-30 05:42:55 INFO: Loading these models for language: bg (Bulgarian):
| Processor | Package  |
------------------------
| tokenize  | standard |
| pos       | standard |

2021-09-30 05:42:55 INFO: Use device: gpu
2021-09-30 05:42:55 INFO: Loading: tokenize
2021-09-30 05:42:55 INFO: Loading: pos
2021-09-30 05:42:57 INFO: Done loading processors!


CPU times: user 1h 9min 9s, sys: 6.31 s, total: 1h 9min 15s
Wall time: 1h 9min 14s


In [35]:
verify_prepped_data(X_train, y_train)

In [36]:
%%time
# Train a fresh CRF on the current X_train / y_train.
# Hyperparameters (per the sklearn-crfsuite CRF documentation):
#   algorithm='lbfgs'             - gradient descent using the L-BFGS method
#   c1 / c2                       - L1 / L2 regularization coefficients
#   max_iterations                - cap on the number of optimizer iterations
#   all_possible_transitions=True - also generate transition features for
#                                   label pairs that never occur in training
crf = sklearn_crfsuite.CRF(
    algorithm='lbfgs',
    c1=0.1,
    c2=0.1,
    max_iterations=100,
    all_possible_transitions=True
)
crf.fit(X_train, y_train)

CPU times: user 3h 2min 3s, sys: 6.58 s, total: 3h 2min 9s
Wall time: 3h 1min 49s


In [None]:
# Evaluate the fitted CRF on the dev set: weighted F1 and a per-label report
# over every label the model learned.
labels = list(crf.classes_)
y_pred = crf.predict(X_dev)
print(metrics.flat_f1_score(y_dev, y_pred, average='weighted', labels=labels))
print(metrics.flat_classification_report(y_dev, y_pred, labels=labels, digits=3))
# Repeat the evaluation without the empty-string label (presumably the
# "no punctuation" class - confirm against data_prep). Filtering, unlike
# labels.remove(''), cannot raise ValueError when '' was never learned.
labels = [label for label in labels if label != '']
print(metrics.flat_f1_score(y_dev, y_pred, average='weighted', labels=labels))
print(metrics.flat_classification_report(y_dev, y_pred, labels=labels, digits=3))
# Show the first 100 elements of punctuate()'s output for a manual check.
punctuated_sentences = punctuate(X_dev, y_pred)
print(punctuated_sentences[:100])

### Expand train set to 70% of data

In [25]:
%%time
X_train, y_train = data_prep('../../data/Train_70.txt')

2021-09-30 12:02:25 INFO: Loading these models for language: bg (Bulgarian):
| Processor | Package  |
------------------------
| tokenize  | standard |

2021-09-30 12:02:25 INFO: Use device: gpu
2021-09-30 12:02:25 INFO: Loading: tokenize
2021-09-30 12:02:25 INFO: Done loading processors!
2021-09-30 12:03:25 INFO: Loading these models for language: bg (Bulgarian):
| Processor | Package  |
------------------------
| tokenize  | standard |
| pos       | standard |

2021-09-30 12:03:25 INFO: Use device: gpu
2021-09-30 12:03:25 INFO: Loading: tokenize
2021-09-30 12:03:25 INFO: Loading: pos
2021-09-30 12:03:28 INFO: Done loading processors!


CPU times: user 1h 20min 57s, sys: 8.48 s, total: 1h 21min 6s
Wall time: 1h 21min 4s


In [26]:
verify_prepped_data(X_train, y_train)

In [27]:
%%time
# Train a fresh CRF on the current X_train / y_train.
# Hyperparameters (per the sklearn-crfsuite CRF documentation):
#   algorithm='lbfgs'             - gradient descent using the L-BFGS method
#   c1 / c2                       - L1 / L2 regularization coefficients
#   max_iterations                - cap on the number of optimizer iterations
#   all_possible_transitions=True - also generate transition features for
#                                   label pairs that never occur in training
crf = sklearn_crfsuite.CRF(
    algorithm='lbfgs',
    c1=0.1,
    c2=0.1,
    max_iterations=100,
    all_possible_transitions=True
)
crf.fit(X_train, y_train)

CPU times: user 3h 20min 38s, sys: 7.05 s, total: 3h 20min 45s
Wall time: 3h 20min 24s


In [None]:
# Evaluate the fitted CRF on the dev set: weighted F1 and a per-label report
# over every label the model learned.
labels = list(crf.classes_)
y_pred = crf.predict(X_dev)
print(metrics.flat_f1_score(y_dev, y_pred, average='weighted', labels=labels))
print(metrics.flat_classification_report(y_dev, y_pred, labels=labels, digits=3))
# Repeat the evaluation without the empty-string label (presumably the
# "no punctuation" class - confirm against data_prep). Filtering, unlike
# labels.remove(''), cannot raise ValueError when '' was never learned.
labels = [label for label in labels if label != '']
print(metrics.flat_f1_score(y_dev, y_pred, average='weighted', labels=labels))
print(metrics.flat_classification_report(y_dev, y_pred, labels=labels, digits=3))
# Show the first 100 elements of punctuate()'s output for a manual check.
punctuated_sentences = punctuate(X_dev, y_pred)
print(punctuated_sentences[:100])

### Check for repetitive word before

In [25]:
%%time
X_train, y_train = data_prep('../../data/Train.txt')

2021-09-30 20:21:14 INFO: Loading these models for language: bg (Bulgarian):
| Processor | Package  |
------------------------
| tokenize  | standard |

2021-09-30 20:21:14 INFO: Use device: gpu
2021-09-30 20:21:14 INFO: Loading: tokenize
2021-09-30 20:21:14 INFO: Done loading processors!
2021-09-30 20:21:39 INFO: Loading these models for language: bg (Bulgarian):
| Processor | Package  |
------------------------
| tokenize  | standard |
| pos       | standard |

2021-09-30 20:21:39 INFO: Use device: gpu
2021-09-30 20:21:39 INFO: Loading: tokenize
2021-09-30 20:21:39 INFO: Loading: pos
2021-09-30 20:21:41 INFO: Done loading processors!


CPU times: user 23min 56s, sys: 3.09 s, total: 23min 59s
Wall time: 23min 58s


In [27]:
verify_prepped_data(X_train, y_train)

In [28]:
%%time
X_dev, y_dev = data_prep('../../data/Dev.txt')

2021-09-30 20:49:20 INFO: Loading these models for language: bg (Bulgarian):
| Processor | Package  |
------------------------
| tokenize  | standard |

2021-09-30 20:49:20 INFO: Use device: gpu
2021-09-30 20:49:20 INFO: Loading: tokenize
2021-09-30 20:49:20 INFO: Done loading processors!
2021-09-30 20:49:33 INFO: Loading these models for language: bg (Bulgarian):
| Processor | Package  |
------------------------
| tokenize  | standard |
| pos       | standard |

2021-09-30 20:49:33 INFO: Use device: gpu
2021-09-30 20:49:33 INFO: Loading: tokenize
2021-09-30 20:49:33 INFO: Loading: pos
2021-09-30 20:49:35 INFO: Done loading processors!


CPU times: user 11min 37s, sys: 1.31 s, total: 11min 38s
Wall time: 11min 38s


In [29]:
verify_prepped_data(X_dev, y_dev)

In [30]:
%%time
# Train a fresh CRF on the current X_train / y_train.
# Hyperparameters (per the sklearn-crfsuite CRF documentation):
#   algorithm='lbfgs'             - gradient descent using the L-BFGS method
#   c1 / c2                       - L1 / L2 regularization coefficients
#   max_iterations                - cap on the number of optimizer iterations
#   all_possible_transitions=True - also generate transition features for
#                                   label pairs that never occur in training
crf = sklearn_crfsuite.CRF(
    algorithm='lbfgs',
    c1=0.1,
    c2=0.1,
    max_iterations=100,
    all_possible_transitions=True
)
crf.fit(X_train, y_train)

CPU times: user 37min 46s, sys: 2.11 s, total: 37min 48s
Wall time: 37min 44s


In [None]:
# Evaluate the fitted CRF on the dev set: weighted F1 and a per-label report
# over every label the model learned.
labels = list(crf.classes_)
y_pred = crf.predict(X_dev)
print(metrics.flat_f1_score(y_dev, y_pred, average='weighted', labels=labels))
print(metrics.flat_classification_report(y_dev, y_pred, labels=labels, digits=3))
# Repeat the evaluation without the empty-string label (presumably the
# "no punctuation" class - confirm against data_prep). Filtering, unlike
# labels.remove(''), cannot raise ValueError when '' was never learned.
labels = [label for label in labels if label != '']
print(metrics.flat_f1_score(y_dev, y_pred, average='weighted', labels=labels))
print(metrics.flat_classification_report(y_dev, y_pred, labels=labels, digits=3))
# Show the first 100 elements of punctuate()'s output for a manual check.
punctuated_sentences = punctuate(X_dev, y_pred)
print(punctuated_sentences[:100])

### Check for number of verbs before

In [74]:
# input  - sentence (list of token dictionaries) and the index i of one token in it
# output - one dictionary holding the features of that token
def word2features(sentence, i):
    """Return the feature dictionary for the token at position ``i`` of ``sentence``.

    Combines the results of the sentence-level helpers (interrogative word,
    interrogative particle, imperative verb), the repetition and verb-count
    helpers evaluated at ``i``, the token's own text/UPOS/XPOS/capitalisation,
    and the same four properties of up to two tokens on either side. ``BOS``
    marks the first token and ``EOS`` the last one.

    sentence -- list of dictionaries; the 'text', 'upos' and 'xpos' keys are read here
    i        -- index of the token to describe
    """
    has_question_word, question_word_pos = contains_interrogative_word(sentence)
    has_question_particle, question_particle_pos = contains_interrogative_particle(sentence)
    has_imperative, imperative_pos = contains_imperative_verb(sentence)
    has_repeat_before, repeat_tag = contains_repetitive_word_before(sentence, i)

    token = sentence[i]
    last_index = len(sentence) - 1

    def neighbour(offset, prefix):
        # Text, tags and capitalisation of the token `offset` places away from i,
        # stored under keys derived from `prefix`.
        other = sentence[i + offset]
        return {
            prefix: other['text'],
            prefix + '_upos': other['upos'],
            prefix + '_xpos': other['xpos'],
            prefix + '_starts_with_capital_letter': other['text'][0].isupper(),
        }

    features = {
        'word': token['text'],
        'sent_len': len(sentence),
        'pos_in_sent': i,
        'upos': token['upos'],
        'xpos': token['xpos'],
        'first_word_in_sent': sentence[0]['text'],
        'contains_interrogative_word': has_question_word,
        'interrogative_word_position': question_word_pos,
        'contains_interrogative_particle': has_question_particle,
        'interrogative_particle_position': question_particle_pos,
        'contains_imperative_verb': has_imperative,
        'imperative_verb_position': imperative_pos,
        'starts_with_capital_letter': token['text'][0].isupper(),
        'contains_repetitive_word_before': has_repeat_before,
        'repetitive_word_tag': repeat_tag,
        'count_of_verbs_before': count_of_verbs_before(sentence, i),
    }

    # Left context: one and two tokens back, or the beginning-of-sentence flag.
    if i <= 0:
        features['BOS'] = True
    else:
        features.update(neighbour(-1, 'prev_word'))
    if i > 1:
        features.update(neighbour(-2, 'word_before_prev_word'))

    # Right context: one and two tokens ahead, or the end-of-sentence flag.
    if i >= last_index:
        features['EOS'] = True
    else:
        features.update(neighbour(1, 'next_word'))
    if i < last_index - 1:
        features.update(neighbour(2, 'word_after_next_word'))

    return features

In [75]:
%%time
# Build the training features and labels from Train.txt.
# The log below shows the Bulgarian tokenize and pos processors being loaded; the recorded run took about 24 minutes.
# NOTE(review): data_prep presumably uses the word2features defined in the previous cell - confirm.
X_train, y_train = data_prep('../../data/Train.txt')

2021-09-30 22:35:58 INFO: Loading these models for language: bg (Bulgarian):
| Processor | Package  |
------------------------
| tokenize  | standard |

2021-09-30 22:35:58 INFO: Use device: gpu
2021-09-30 22:35:58 INFO: Loading: tokenize
2021-09-30 22:35:58 INFO: Done loading processors!
2021-09-30 22:36:16 INFO: Loading these models for language: bg (Bulgarian):
| Processor | Package  |
------------------------
| tokenize  | standard |
| pos       | standard |

2021-09-30 22:36:16 INFO: Use device: gpu
2021-09-30 22:36:16 INFO: Loading: tokenize
2021-09-30 22:36:16 INFO: Loading: pos
2021-09-30 22:36:18 INFO: Done loading processors!


CPU times: user 24min 6s, sys: 2.42 s, total: 24min 8s
Wall time: 24min 7s


In [76]:
# Sanity-check the regenerated training features against their labels.
# NOTE(review): verify_prepped_data is defined outside this cell; presumably it checks that X and y line up - confirm.
verify_prepped_data(X_train, y_train)

In [77]:
%%time
# Build the dev features and labels from Dev.txt.
# The log below shows the Bulgarian tokenize and pos processors being loaded; the recorded run took about 12 minutes.
# NOTE(review): data_prep presumably uses the word2features currently defined in the kernel - confirm.
X_dev, y_dev = data_prep('../../data/Dev.txt')

2021-09-30 23:00:06 INFO: Loading these models for language: bg (Bulgarian):
| Processor | Package  |
------------------------
| tokenize  | standard |

2021-09-30 23:00:06 INFO: Use device: gpu
2021-09-30 23:00:06 INFO: Loading: tokenize
2021-09-30 23:00:06 INFO: Done loading processors!
2021-09-30 23:00:19 INFO: Loading these models for language: bg (Bulgarian):
| Processor | Package  |
------------------------
| tokenize  | standard |
| pos       | standard |

2021-09-30 23:00:19 INFO: Use device: gpu
2021-09-30 23:00:19 INFO: Loading: tokenize
2021-09-30 23:00:19 INFO: Loading: pos
2021-09-30 23:00:21 INFO: Done loading processors!


CPU times: user 11min 58s, sys: 1.18 s, total: 11min 59s
Wall time: 11min 59s


In [78]:
# Sanity-check the regenerated dev features against their labels.
# NOTE(review): verify_prepped_data is defined outside this cell; presumably it checks that X and y line up - confirm.
verify_prepped_data(X_dev, y_dev)

In [79]:
%%time
# Fit a CRF on the training data with L-BFGS, using c1 (L1) and c2 (L2)
# regularisation coefficients of 0.1 and at most 100 iterations.
crf = sklearn_crfsuite.CRF(
    algorithm='lbfgs',
    c1=0.1,
    c2=0.1,
    max_iterations=100,
    all_possible_transitions=True
)
crf.fit(X_train, y_train)

CPU times: user 46min 27s, sys: 2.71 s, total: 46min 30s
Wall time: 46min 24s


In [None]:
# Evaluate the fitted model on the dev set.
labels = list(crf.classes_)
y_pred = crf.predict(X_dev)
# First pass: weighted F1 and per-label report over every label the model knows.
print(metrics.flat_f1_score(y_dev, y_pred, average='weighted', labels=labels))
print(metrics.flat_classification_report(y_dev, y_pred, labels=labels, digits=3))
# Second pass: the same metrics with the empty-string label left out.
# NOTE(review): '' is presumably the "no punctuation" label - confirm.
labels.remove('')
print(metrics.flat_f1_score(y_dev, y_pred, average='weighted', labels=labels))
print(metrics.flat_classification_report(y_dev, y_pred, labels=labels, digits=3))
# Combine the dev features with the predicted labels and print the first 100 elements of the result.
punctuated_sentences = punctuate(X_dev, y_pred)
print(punctuated_sentences[:100])

### Check results with some features excluded

In [23]:
# Load the training features from JSON instead of recomputing them.
# NOTE(review): presumably a saved result of data_prep on Train.txt - confirm which word2features produced it.
X_train = load_json('../../data/X_train.json')

In [24]:
# Load the training labels from JSON.
# NOTE(review): presumably saved together with X_train.json - confirm.
y_train = load_json('../../data/y_train.json')

In [25]:
# Load the dev features from JSON instead of recomputing them.
# NOTE(review): presumably a saved result of data_prep on Dev.txt - confirm which word2features produced it.
X_dev = load_json('../../data/X_dev.json')

In [26]:
# Load the dev labels from JSON.
# NOTE(review): presumably saved together with X_dev.json - confirm.
y_dev = load_json('../../data/y_dev.json')

In [28]:
# Ablated copies of the dev features: each variant leaves out one group of feature
# keys, and every variant also leaves out 'count_of_verbs_before'.

# No first-word-of-sentence feature.
X_dev_without_first_word_in_sent = [
    [
        {name: value for name, value in word.items()
         if name not in ('first_word_in_sent', 'count_of_verbs_before')}
        for word in sentence
    ]
    for sentence in X_dev
]

# No sentence-length feature.
X_dev_without_sent_len = [
    [
        {name: value for name, value in word.items()
         if name not in ('sent_len', 'count_of_verbs_before')}
        for word in sentence
    ]
    for sentence in X_dev
]

# No positional features ("pos" in the variable name means position, not part of speech).
X_dev_without_pos = [
    [
        {name: value for name, value in word.items()
         if name not in ('pos_in_sent', 'interrogative_word_position', 'interrogative_particle_position',
                         'imperative_verb_position', 'count_of_verbs_before')}
        for word in sentence
    ]
    for sentence in X_dev
]

# No UPOS features for the word or its neighbours (the XPOS features are kept).
X_dev_without_upos = [
    [
        {name: value for name, value in word.items()
         if name not in ('upos', 'prev_word_upos', 'word_before_prev_word_upos', 'next_word_upos',
                         'word_after_next_word_upos', 'count_of_verbs_before')}
        for word in sentence
    ]
    for sentence in X_dev
]

In [29]:
# Ablated copies of the training features: each variant leaves out one group of
# feature keys, and every variant also leaves out 'count_of_verbs_before'.

# No first-word-of-sentence feature.
X_train_without_first_word_in_sent = [
    [
        {name: value for name, value in word.items()
         if name not in ('first_word_in_sent', 'count_of_verbs_before')}
        for word in sentence
    ]
    for sentence in X_train
]

# No sentence-length feature.
X_train_without_sent_len = [
    [
        {name: value for name, value in word.items()
         if name not in ('sent_len', 'count_of_verbs_before')}
        for word in sentence
    ]
    for sentence in X_train
]

# No positional features ("pos" in the variable name means position, not part of speech).
X_train_without_pos = [
    [
        {name: value for name, value in word.items()
         if name not in ('pos_in_sent', 'interrogative_word_position', 'interrogative_particle_position',
                         'imperative_verb_position', 'count_of_verbs_before')}
        for word in sentence
    ]
    for sentence in X_train
]

# No UPOS features for the word or its neighbours (the XPOS features are kept).
X_train_without_upos = [
    [
        {name: value for name, value in word.items()
         if name not in ('upos', 'prev_word_upos', 'word_before_prev_word_upos', 'next_word_upos',
                         'word_after_next_word_upos', 'count_of_verbs_before')}
        for word in sentence
    ]
    for sentence in X_train
]

In [38]:
# Sanity-check the dev variant without 'first_word_in_sent' against the dev labels.
# NOTE(review): verify_prepped_data is defined outside this cell; presumably it checks that X and y line up - confirm.
verify_prepped_data(X_dev_without_first_word_in_sent, y_dev)

In [39]:
# Sanity-check the dev variant without 'sent_len' against the dev labels.
# NOTE(review): verify_prepped_data is defined outside this cell; presumably it checks that X and y line up - confirm.
verify_prepped_data(X_dev_without_sent_len, y_dev)

In [41]:
# Sanity-check the dev variant without the positional features against the dev labels.
# NOTE(review): verify_prepped_data is defined outside this cell; presumably it checks that X and y line up - confirm.
verify_prepped_data(X_dev_without_pos, y_dev)

In [42]:
# Sanity-check the dev variant without the UPOS features against the dev labels.
# NOTE(review): verify_prepped_data is defined outside this cell; presumably it checks that X and y line up - confirm.
verify_prepped_data(X_dev_without_upos, y_dev)

In [43]:
%%time
# Ablation 1: fit the CRF (same hyperparameters as the other runs in this notebook)
# on the training variant without 'first_word_in_sent'.
crf = sklearn_crfsuite.CRF(
    algorithm='lbfgs',
    c1=0.1,
    c2=0.1,
    max_iterations=100,
    all_possible_transitions=True
)
crf.fit(X_train_without_first_word_in_sent, y_train)

CPU times: user 30min 59s, sys: 999 ms, total: 31min
Wall time: 30min 58s


In [44]:
# Evaluate the model trained without 'first_word_in_sent' on the matching dev variant.
labels = list(crf.classes_)
y_pred = crf.predict(X_dev_without_first_word_in_sent)
# First pass: weighted F1 and per-label report over every label the model knows.
print(metrics.flat_f1_score(y_dev, y_pred, average='weighted', labels=labels))
print(metrics.flat_classification_report(y_dev, y_pred, labels=labels, digits=3))
# Second pass: the same metrics with the empty-string label left out
# (the blank-named row with the largest support in the report below).
# NOTE(review): '' is presumably the "no punctuation" label - confirm.
labels.remove('')
print(metrics.flat_f1_score(y_dev, y_pred, average='weighted', labels=labels))
print(metrics.flat_classification_report(y_dev, y_pred, labels=labels, digits=3))
# Combine the dev features with the predicted labels and print the first 100 elements of the result.
punctuated_sentences = punctuate(X_dev_without_first_word_in_sent, y_pred)
print(punctuated_sentences[:100])

  average, "true nor predicted", 'F-score is', len(true_sum)


0.9390237683618073


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


              precision    recall  f1-score   support

                  0.959     0.989     0.974    227825
           :      0.893     0.713     0.793       923
           !      0.429     0.156     0.228       347
           ,      0.799     0.639     0.710     18592
           ?      0.828     0.630     0.715       443
           ;      0.221     0.037     0.063       622
           .      0.931     0.986     0.958     11401
           -      0.563     0.111     0.186       882
          ,(      0.000     0.000     0.000         5
           "      0.636     0.140     0.230      1596
         ").      0.000     0.000     0.000         0
           (      0.965     0.883     0.922      1656
           )      0.880     0.846     0.863      1001
          ?-      0.000     0.000     0.000         4
          .-      0.000     0.000     0.000         2
          :)      0.000     0.000     0.000         0
          !-      0.000     0.000     0.000        20
          :"      0.636    



              precision    recall  f1-score   support

           :      0.893     0.713     0.793       923
           !      0.429     0.156     0.228       347
           ,      0.799     0.639     0.710     18592
           ?      0.828     0.630     0.715       443
           ;      0.221     0.037     0.063       622
           .      0.931     0.986     0.958     11401
           -      0.563     0.111     0.186       882
          ,(      0.000     0.000     0.000         5
           "      0.636     0.140     0.230      1596
         ").      0.000     0.000     0.000         0
           (      0.965     0.883     0.922      1656
           )      0.880     0.846     0.863      1001
          ?-      0.000     0.000     0.000         4
          .-      0.000     0.000     0.000         2
          :)      0.000     0.000     0.000         0
          !-      0.000     0.000     0.000        20
          :"      0.636     0.104     0.179        67
          ."      0.167    

In [45]:
%%time
# Ablation 2: fit the CRF (same hyperparameters as the other runs in this notebook)
# on the training variant without 'sent_len'.
crf = sklearn_crfsuite.CRF(
    algorithm='lbfgs',
    c1=0.1,
    c2=0.1,
    max_iterations=100,
    all_possible_transitions=True
)
crf.fit(X_train_without_sent_len, y_train)

CPU times: user 28min 13s, sys: 924 ms, total: 28min 14s
Wall time: 28min 11s


In [46]:
# Evaluate the model trained without 'sent_len' on the matching dev variant.
labels = list(crf.classes_)
y_pred = crf.predict(X_dev_without_sent_len)
# First pass: weighted F1 and per-label report over every label the model knows.
print(metrics.flat_f1_score(y_dev, y_pred, average='weighted', labels=labels))
print(metrics.flat_classification_report(y_dev, y_pred, labels=labels, digits=3))
# Second pass: the same metrics with the empty-string label left out
# (the blank-named row with the largest support in the report below).
# NOTE(review): '' is presumably the "no punctuation" label - confirm.
labels.remove('')
print(metrics.flat_f1_score(y_dev, y_pred, average='weighted', labels=labels))
print(metrics.flat_classification_report(y_dev, y_pred, labels=labels, digits=3))
# Combine the dev features with the predicted labels and print the first 100 elements of the result.
punctuated_sentences = punctuate(X_dev_without_sent_len, y_pred)
print(punctuated_sentences[:100])

  average, "true nor predicted", 'F-score is', len(true_sum)


0.9389738459216473


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


              precision    recall  f1-score   support

                  0.959     0.989     0.974    227825
           :      0.900     0.731     0.807       923
           !      0.489     0.184     0.268       347
           ,      0.797     0.642     0.711     18592
           ?      0.852     0.677     0.755       443
           ;      0.234     0.042     0.071       622
           .      0.933     0.987     0.959     11401
           -      0.547     0.100     0.169       882
          ,(      0.000     0.000     0.000         5
           "      0.633     0.130     0.215      1596
         ").      0.000     0.000     0.000         0
           (      0.970     0.876     0.920      1656
           )      0.879     0.838     0.858      1001
          ?-      0.000     0.000     0.000         4
          .-      0.000     0.000     0.000         2
          :)      0.000     0.000     0.000         0
          !-      0.000     0.000     0.000        20
          :"      0.769    



              precision    recall  f1-score   support

           :      0.900     0.731     0.807       923
           !      0.489     0.184     0.268       347
           ,      0.797     0.642     0.711     18592
           ?      0.852     0.677     0.755       443
           ;      0.234     0.042     0.071       622
           .      0.933     0.987     0.959     11401
           -      0.547     0.100     0.169       882
          ,(      0.000     0.000     0.000         5
           "      0.633     0.130     0.215      1596
         ").      0.000     0.000     0.000         0
           (      0.970     0.876     0.920      1656
           )      0.879     0.838     0.858      1001
          ?-      0.000     0.000     0.000         4
          .-      0.000     0.000     0.000         2
          :)      0.000     0.000     0.000         0
          !-      0.000     0.000     0.000        20
          :"      0.769     0.149     0.250        67
          ."      0.333    

In [47]:
%%time
# Ablation 3: fit the CRF (same hyperparameters as the other runs in this notebook)
# on the training variant without the positional features.
crf = sklearn_crfsuite.CRF(
    algorithm='lbfgs',
    c1=0.1,
    c2=0.1,
    max_iterations=100,
    all_possible_transitions=True
)
crf.fit(X_train_without_pos, y_train)

CPU times: user 27min 49s, sys: 836 ms, total: 27min 50s
Wall time: 27min 47s


In [48]:
# Evaluate the model trained without the positional features on the matching dev variant.
labels = list(crf.classes_)
y_pred = crf.predict(X_dev_without_pos)
# First pass: weighted F1 and per-label report over every label the model knows.
print(metrics.flat_f1_score(y_dev, y_pred, average='weighted', labels=labels))
print(metrics.flat_classification_report(y_dev, y_pred, labels=labels, digits=3))
# Second pass: the same metrics with the empty-string label left out
# (the blank-named row with the largest support in the report below).
# NOTE(review): '' is presumably the "no punctuation" label - confirm.
labels.remove('')
print(metrics.flat_f1_score(y_dev, y_pred, average='weighted', labels=labels))
print(metrics.flat_classification_report(y_dev, y_pred, labels=labels, digits=3))
# Combine the dev features with the predicted labels and print the first 100 elements of the result.
punctuated_sentences = punctuate(X_dev_without_pos, y_pred)
print(punctuated_sentences[:100])

  average, "true nor predicted", 'F-score is', len(true_sum)


0.9386445886287452


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


              precision    recall  f1-score   support

                  0.958     0.990     0.974    227825
           :      0.916     0.705     0.797       923
           !      0.523     0.199     0.288       347
           ,      0.807     0.630     0.708     18592
           ?      0.870     0.679     0.763       443
           ;      0.280     0.042     0.073       622
           .      0.933     0.988     0.960     11401
           -      0.591     0.100     0.171       882
          ,(      0.000     0.000     0.000         5
           "      0.620     0.128     0.212      1596
         ").      0.000     0.000     0.000         0
           (      0.973     0.872     0.920      1656
           )      0.879     0.831     0.855      1001
          ?-      0.000     0.000     0.000         4
          .-      0.000     0.000     0.000         2
          :)      0.000     0.000     0.000         0
          !-      0.000     0.000     0.000        20
          :"      0.667    



              precision    recall  f1-score   support

           :      0.916     0.705     0.797       923
           !      0.523     0.199     0.288       347
           ,      0.807     0.630     0.708     18592
           ?      0.870     0.679     0.763       443
           ;      0.280     0.042     0.073       622
           .      0.933     0.988     0.960     11401
           -      0.591     0.100     0.171       882
          ,(      0.000     0.000     0.000         5
           "      0.620     0.128     0.212      1596
         ").      0.000     0.000     0.000         0
           (      0.973     0.872     0.920      1656
           )      0.879     0.831     0.855      1001
          ?-      0.000     0.000     0.000         4
          .-      0.000     0.000     0.000         2
          :)      0.000     0.000     0.000         0
          !-      0.000     0.000     0.000        20
          :"      0.667     0.090     0.158        67
          ."      0.333    

In [49]:
%%time
# Ablation 4: fit the CRF (same hyperparameters as the other runs in this notebook)
# on the training variant without the UPOS features.
crf = sklearn_crfsuite.CRF(
    algorithm='lbfgs',
    c1=0.1,
    c2=0.1,
    max_iterations=100,
    all_possible_transitions=True
)
crf.fit(X_train_without_upos, y_train)

CPU times: user 26min 55s, sys: 721 ms, total: 26min 56s
Wall time: 26min 53s


In [50]:
# Evaluate the model trained without the UPOS features on the matching dev variant.
labels = list(crf.classes_)
y_pred = crf.predict(X_dev_without_upos)
# First pass: weighted F1 and per-label report over every label the model knows.
print(metrics.flat_f1_score(y_dev, y_pred, average='weighted', labels=labels))
print(metrics.flat_classification_report(y_dev, y_pred, labels=labels, digits=3))
# Second pass: the same metrics with the empty-string label left out
# (the blank-named row with the largest support in the report below).
# NOTE(review): '' is presumably the "no punctuation" label - confirm.
labels.remove('')
print(metrics.flat_f1_score(y_dev, y_pred, average='weighted', labels=labels))
print(metrics.flat_classification_report(y_dev, y_pred, labels=labels, digits=3))
# Combine the dev features with the predicted labels and print the first 100 elements of the result.
punctuated_sentences = punctuate(X_dev_without_upos, y_pred)
print(punctuated_sentences[:100])

  average, "true nor predicted", 'F-score is', len(true_sum)


0.9388912093385249


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


              precision    recall  f1-score   support

                  0.959     0.989     0.974    227825
           :      0.915     0.711     0.800       923
           !      0.522     0.202     0.291       347
           ,      0.797     0.640     0.710     18592
           ?      0.860     0.679     0.759       443
           ;      0.237     0.045     0.076       622
           .      0.934     0.988     0.960     11401
           -      0.574     0.101     0.172       882
          ,(      0.000     0.000     0.000         5
           "      0.614     0.128     0.212      1596
         ").      0.000     0.000     0.000         0
           (      0.968     0.869     0.916      1656
           )      0.874     0.829     0.851      1001
          ?-      0.143     0.250     0.182         4
          .-      0.000     0.000     0.000         2
          :)      0.000     0.000     0.000         0
          !-      0.000     0.000     0.000        20
          :"      0.700    



              precision    recall  f1-score   support

           :      0.915     0.711     0.800       923
           !      0.522     0.202     0.291       347
           ,      0.797     0.640     0.710     18592
           ?      0.860     0.679     0.759       443
           ;      0.237     0.045     0.076       622
           .      0.934     0.988     0.960     11401
           -      0.574     0.101     0.172       882
          ,(      0.000     0.000     0.000         5
           "      0.614     0.128     0.212      1596
         ").      0.000     0.000     0.000         0
           (      0.968     0.869     0.916      1656
           )      0.874     0.829     0.851      1001
          ?-      0.143     0.250     0.182         4
          .-      0.000     0.000     0.000         2
          :)      0.000     0.000     0.000         0
          !-      0.000     0.000     0.000        20
          :"      0.700     0.104     0.182        67
          ."      0.333    

### Try out a model with verb count for the whole word window

In [23]:
# input  - sentence (list of token dictionaries) and the index i of one token in it
# output - one dictionary holding the features of that token
def word2features(sentence, i):
    """Return the feature dictionary for the token at position ``i`` of ``sentence``.

    Combines the results of the sentence-level helpers (interrogative word,
    interrogative particle, imperative verb), the repetition and verb-count
    helpers evaluated at ``i``, and the token's own text/UPOS/XPOS/capitalisation.
    For up to two tokens on either side it adds the same four properties plus
    ``count_of_verbs_before`` evaluated at that neighbour's index. ``BOS`` marks
    the first token and ``EOS`` the last one.

    sentence -- list of dictionaries; the 'text', 'upos' and 'xpos' keys are read here
    i        -- index of the token to describe
    """
    has_question_word, question_word_pos = contains_interrogative_word(sentence)
    has_question_particle, question_particle_pos = contains_interrogative_particle(sentence)
    has_imperative, imperative_pos = contains_imperative_verb(sentence)
    has_repeat_before, repeat_tag = contains_repetitive_word_before(sentence, i)

    token = sentence[i]
    last_index = len(sentence) - 1

    def neighbour(offset, prefix):
        # Text, tags, capitalisation and preceding-verb count of the token
        # `offset` places away from i, stored under keys derived from `prefix`.
        j = i + offset
        other = sentence[j]
        return {
            prefix: other['text'],
            prefix + '_upos': other['upos'],
            prefix + '_xpos': other['xpos'],
            prefix + '_starts_with_capital_letter': other['text'][0].isupper(),
            'count_of_verbs_before_' + prefix: count_of_verbs_before(sentence, j),
        }

    features = {
        'word': token['text'],
        'sent_len': len(sentence),
        'pos_in_sent': i,
        'upos': token['upos'],
        'xpos': token['xpos'],
        'first_word_in_sent': sentence[0]['text'],
        'contains_interrogative_word': has_question_word,
        'interrogative_word_position': question_word_pos,
        'contains_interrogative_particle': has_question_particle,
        'interrogative_particle_position': question_particle_pos,
        'contains_imperative_verb': has_imperative,
        'imperative_verb_position': imperative_pos,
        'starts_with_capital_letter': token['text'][0].isupper(),
        'contains_repetitive_word_before': has_repeat_before,
        'repetitive_word_tag': repeat_tag,
        'count_of_verbs_before': count_of_verbs_before(sentence, i),
    }

    # Left context: one and two tokens back, or the beginning-of-sentence flag.
    if i <= 0:
        features['BOS'] = True
    else:
        features.update(neighbour(-1, 'prev_word'))
    if i > 1:
        features.update(neighbour(-2, 'word_before_prev_word'))

    # Right context: one and two tokens ahead, or the end-of-sentence flag.
    if i >= last_index:
        features['EOS'] = True
    else:
        features.update(neighbour(1, 'next_word'))
    if i < last_index - 1:
        features.update(neighbour(2, 'word_after_next_word'))

    return features

In [24]:
%%time
# Build the training features and labels from Train.txt.
# The log below shows the Bulgarian tokenize and pos processors being loaded; the recorded run took about 21 minutes.
# NOTE(review): data_prep presumably uses the word2features defined in the previous cell - confirm.
X_train, y_train = data_prep('../../data/Train.txt')

2021-10-01 12:09:48 INFO: Loading these models for language: bg (Bulgarian):
| Processor | Package  |
------------------------
| tokenize  | standard |

2021-10-01 12:09:48 INFO: Use device: gpu
2021-10-01 12:09:48 INFO: Loading: tokenize
2021-10-01 12:09:48 INFO: Done loading processors!
2021-10-01 12:10:02 INFO: Loading these models for language: bg (Bulgarian):
| Processor | Package  |
------------------------
| tokenize  | standard |
| pos       | standard |

2021-10-01 12:10:02 INFO: Use device: gpu
2021-10-01 12:10:02 INFO: Loading: tokenize
2021-10-01 12:10:02 INFO: Loading: pos
2021-10-01 12:10:05 INFO: Done loading processors!


CPU times: user 20min 40s, sys: 2.47 s, total: 20min 43s
Wall time: 20min 42s


In [25]:
# Sanity-check the regenerated training features against their labels.
# NOTE(review): verify_prepped_data is defined outside this cell; presumably it checks that X and y line up - confirm.
verify_prepped_data(X_train, y_train)

In [26]:
%%time
# Build the dev features and labels from Dev.txt.
# The log below shows the Bulgarian tokenize and pos processors being loaded; the recorded run took about 12 minutes.
# NOTE(review): data_prep presumably uses the word2features currently defined in the kernel - confirm.
X_dev, y_dev = data_prep('../../data/Dev.txt')

2021-10-01 12:30:31 INFO: Loading these models for language: bg (Bulgarian):
| Processor | Package  |
------------------------
| tokenize  | standard |

2021-10-01 12:30:31 INFO: Use device: gpu
2021-10-01 12:30:31 INFO: Loading: tokenize
2021-10-01 12:30:31 INFO: Done loading processors!
2021-10-01 12:30:44 INFO: Loading these models for language: bg (Bulgarian):
| Processor | Package  |
------------------------
| tokenize  | standard |
| pos       | standard |

2021-10-01 12:30:44 INFO: Use device: gpu
2021-10-01 12:30:44 INFO: Loading: tokenize
2021-10-01 12:30:44 INFO: Loading: pos
2021-10-01 12:30:46 INFO: Done loading processors!


CPU times: user 11min 31s, sys: 1.28 s, total: 11min 32s
Wall time: 11min 31s


In [27]:
# Sanity-check the regenerated dev features against their labels.
# NOTE(review): verify_prepped_data is defined outside this cell; presumably it checks that X and y line up - confirm.
verify_prepped_data(X_dev, y_dev)

In [28]:
%%time
# Fit a CRF on the training data with L-BFGS, using c1 (L1) and c2 (L2)
# regularisation coefficients of 0.1 and at most 100 iterations.
crf = sklearn_crfsuite.CRF(
    algorithm='lbfgs',
    c1=0.1,
    c2=0.1,
    max_iterations=100,
    all_possible_transitions=True
)
crf.fit(X_train, y_train)

CPU times: user 29min 33s, sys: 1.04 s, total: 29min 34s
Wall time: 29min 31s


In [None]:
# Evaluate the fitted model on the dev set.
labels = list(crf.classes_)
y_pred = crf.predict(X_dev)
# First pass: weighted F1 and per-label report over every label the model knows.
print(metrics.flat_f1_score(y_dev, y_pred, average='weighted', labels=labels))
print(metrics.flat_classification_report(y_dev, y_pred, labels=labels, digits=3))
# Second pass: the same metrics with the empty-string label left out.
# NOTE(review): '' is presumably the "no punctuation" label - confirm.
labels.remove('')
print(metrics.flat_f1_score(y_dev, y_pred, average='weighted', labels=labels))
print(metrics.flat_classification_report(y_dev, y_pred, labels=labels, digits=3))
# Combine the dev features with the predicted labels and print the first 100 elements of the result.
punctuated_sentences = punctuate(X_dev, y_pred)
print(punctuated_sentences[:100])

### Try out the algorithm with a higher number of max_iterations

In [23]:
def _starts_with_capital_letter(text):
    """Return True when the first character of ``text`` is uppercase.

    Slicing (``text[:1]``) rather than indexing (``text[0]``) makes an empty
    string yield False instead of raising IndexError.
    """
    return text[:1].isupper()


def _context_word_features(token, prefix):
    """Build the text/upos/xpos/capitalisation features of a neighbouring token.

    ``prefix`` is the feature-name stem (e.g. 'prev_word'); the generated keys
    are ``prefix``, ``prefix + '_upos'``, ``prefix + '_xpos'`` and
    ``prefix + '_starts_with_capital_letter'``.
    """
    return {
        prefix: token['text'],
        prefix + '_upos': token['upos'],
        prefix + '_xpos': token['xpos'],
        prefix + '_starts_with_capital_letter': _starts_with_capital_letter(token['text'])
    }


def word2features(sentence, i):
    """Build the CRF feature dictionary for the word at position ``i``.

    Args:
        sentence: list of token dictionaries; each token is read through its
            'text', 'upos' and 'xpos' keys.
        i: index into ``sentence`` of the word to describe.

    Returns:
        One dict (not a list) mapping feature names to values for
        ``sentence[i]``: the word itself, sentence-level flags, and the same
        text/tag/capitalisation features for up to two words on either side.
        'BOS' is set for the first word and 'EOS' for the last one.
    """
    # Sentence-level helpers are defined outside this cell.
    # NOTE(review): an earlier word2features in this notebook unpacked these helpers
    # into (flag, position) pairs; here the whole return value becomes the feature
    # value — confirm the helpers now return a single scalar.
    # NOTE(review): they are re-evaluated for every word of the same sentence;
    # hoisting them into the caller would avoid the repeated work.
    sentence_contains_interrogative_word = contains_interrogative_word(sentence)
    sentence_contains_interrogative_particle = contains_interrogative_particle(sentence)
    sentence_contains_imperative_verb = contains_imperative_verb(sentence)
    sentence_contains_repetitive_word_before, repetitive_word_tag = contains_repetitive_word_before(sentence, i)

    token = sentence[i]
    last_index = len(sentence) - 1

    features = {
        'word': token['text'],
        'sent_len': len(sentence),
        'upos': token['upos'],
        'xpos': token['xpos'],
        'first_word_in_sent': sentence[0]['text'],
        'contains_interrogative_word': sentence_contains_interrogative_word,
        'contains_interrogative_particle': sentence_contains_interrogative_particle,
        'contains_imperative_verb': sentence_contains_imperative_verb,
        'starts_with_capital_letter': _starts_with_capital_letter(token['text']),
        'contains_repetitive_word_before': sentence_contains_repetitive_word_before,
        'repetitive_word_tag': repetitive_word_tag
    }

    # Left context: previous word, or a beginning-of-sentence marker.
    if i > 0:
        features.update(_context_word_features(sentence[i-1], 'prev_word'))
    else:
        features['BOS'] = True

    if i > 1:
        features.update(_context_word_features(sentence[i-2], 'word_before_prev_word'))

    # Right context: next word, or an end-of-sentence marker.
    if i < last_index:
        features.update(_context_word_features(sentence[i+1], 'next_word'))
    else:
        features['EOS'] = True

    if i < last_index - 1:
        features.update(_context_word_features(sentence[i+2], 'word_after_next_word'))

    return features

In [24]:
%%time
# Rebuild the training features (X_train) and labels (y_train) from the raw training
# text file, so they reflect the word2features defined in the cell above.
# data_prep is defined outside this section; the recorded run below took ~22 min.
X_train, y_train = data_prep('../../data/Train.txt')

2021-10-02 12:54:53 INFO: Loading these models for language: bg (Bulgarian):
| Processor | Package  |
------------------------
| tokenize  | standard |

2021-10-02 12:54:53 INFO: Use device: gpu
2021-10-02 12:54:53 INFO: Loading: tokenize
2021-10-02 12:54:53 INFO: Done loading processors!
2021-10-02 12:55:10 INFO: Loading these models for language: bg (Bulgarian):
| Processor | Package  |
------------------------
| tokenize  | standard |
| pos       | standard |

2021-10-02 12:55:10 INFO: Use device: gpu
2021-10-02 12:55:10 INFO: Loading: tokenize
2021-10-02 12:55:10 INFO: Loading: pos
2021-10-02 12:55:13 INFO: Done loading processors!


CPU times: user 21min 49s, sys: 2.81 s, total: 21min 52s
Wall time: 21min 51s


In [25]:
# Sanity-check the freshly prepared training data before fitting.
# NOTE(review): verify_prepped_data is defined outside this section; presumably it
# checks that features and labels line up — confirm against its definition.
verify_prepped_data(X_train, y_train)

In [26]:
%%time
# Rebuild the dev-set features (X_dev) and labels (y_dev) from the raw dev text file.
# data_prep is defined outside this section; the recorded run below took ~11.5 min.
X_dev, y_dev = data_prep('../../data/Dev.txt')

2021-10-02 13:16:45 INFO: Loading these models for language: bg (Bulgarian):
| Processor | Package  |
------------------------
| tokenize  | standard |

2021-10-02 13:16:45 INFO: Use device: gpu
2021-10-02 13:16:45 INFO: Loading: tokenize
2021-10-02 13:16:45 INFO: Done loading processors!
2021-10-02 13:16:58 INFO: Loading these models for language: bg (Bulgarian):
| Processor | Package  |
------------------------
| tokenize  | standard |
| pos       | standard |

2021-10-02 13:16:58 INFO: Use device: gpu
2021-10-02 13:16:58 INFO: Loading: tokenize
2021-10-02 13:16:58 INFO: Loading: pos
2021-10-02 13:17:00 INFO: Done loading processors!


CPU times: user 11min 32s, sys: 1.24 s, total: 11min 34s
Wall time: 11min 33s


In [27]:
# Sanity-check the freshly prepared dev data before it is used for evaluation.
# NOTE(review): verify_prepped_data is defined outside this section; presumably it
# checks that features and labels line up — confirm against its definition.
verify_prepped_data(X_dev, y_dev)

In [28]:
%%time
# Fit the L-BFGS CRF again with the iteration cap raised to 200; all other
# parameters are unchanged. The recorded run below took ~52 min.
crf = sklearn_crfsuite.CRF(
    algorithm='lbfgs',  # L-BFGS gradient-based training
    c1=0.1,  # L1 regularisation coefficient (per the sklearn-crfsuite docs)
    c2=0.1,  # L2 regularisation coefficient (per the sklearn-crfsuite docs)
    max_iterations=200,  # upper bound on optimiser iterations
    all_possible_transitions=True  # also model label transitions absent from the training data
)
crf.fit(X_train, y_train)

CPU times: user 52min 20s, sys: 2.58 s, total: 52min 23s
Wall time: 52min 17s


In [29]:
# Evaluate the fitted CRF on the dev set. Weighted F1 and the per-label report are
# printed for three label selections, in this order:
#   1. every label the model knows (crf.classes_),
#   2. the same labels without '' (presumably the "no punctuation" label — confirm),
#   3. a fixed list of single punctuation marks plus '...'.
# The second selection is built with a filter instead of labels.remove(''), so the
# cell no longer aborts with ValueError when '' is not among crf.classes_.
y_pred = crf.predict(X_dev)
label_selections = [
    list(crf.classes_),
    [label for label in crf.classes_ if label != ''],
    [',','(',')','"','[',']',';','.','?','!',':','-', '...'],
]
for labels in label_selections:
    print(metrics.flat_f1_score(y_dev, y_pred, average='weighted', labels=labels))
    print(metrics.flat_classification_report(y_dev, y_pred, labels=labels, digits=3))

# Show the first 100 entries of what punctuate returns for the predicted labels.
punctuated_sentences = punctuate(X_dev, y_pred)
print(punctuated_sentences[:100])

  average, "true nor predicted", 'F-score is', len(true_sum)


0.9399461413799953


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


              precision    recall  f1-score   support

                  0.961     0.988     0.974    227825
           :      0.899     0.739     0.811       923
           !      0.493     0.210     0.295       347
           ,      0.784     0.654     0.713     18592
           ?      0.839     0.707     0.767       443
           ;      0.235     0.056     0.091       622
           .      0.936     0.987     0.961     11401
           -      0.530     0.109     0.181       882
          ,(      0.000     0.000     0.000         5
           "      0.632     0.145     0.236      1596
         ").      0.000     0.000     0.000         0
           (      0.961     0.883     0.920      1656
           )      0.884     0.843     0.863      1001
          ?-      0.000     0.000     0.000         4
          .-      0.000     0.000     0.000         2
          :)      0.000     0.000     0.000         0
          !-      1.000     0.050     0.095        20
          :"      0.762    



              precision    recall  f1-score   support

           :      0.899     0.739     0.811       923
           !      0.493     0.210     0.295       347
           ,      0.784     0.654     0.713     18592
           ?      0.839     0.707     0.767       443
           ;      0.235     0.056     0.091       622
           .      0.936     0.987     0.961     11401
           -      0.530     0.109     0.181       882
          ,(      0.000     0.000     0.000         5
           "      0.632     0.145     0.236      1596
         ").      0.000     0.000     0.000         0
           (      0.961     0.883     0.920      1656
           )      0.884     0.843     0.863      1001
          ?-      0.000     0.000     0.000         4
          .-      0.000     0.000     0.000         2
          :)      0.000     0.000     0.000         0
          !-      1.000     0.050     0.095        20
          :"      0.762     0.239     0.364        67
          ."      0.500    



              precision    recall  f1-score   support

           ,      0.784     0.654     0.713     18592
           (      0.961     0.883     0.920      1656
           )      0.884     0.843     0.863      1001
           "      0.632     0.145     0.236      1596
           [      0.000     0.000     0.000        45
           ]      0.000     0.000     0.000        43
           ;      0.235     0.056     0.091       622
           .      0.936     0.987     0.961     11401
           ?      0.839     0.707     0.767       443
           !      0.493     0.210     0.295       347
           :      0.899     0.739     0.811       923
           -      0.530     0.109     0.181       882
         ...      0.000     0.000     0.000        61

   micro avg      0.849     0.722     0.780     37612
   macro avg      0.553     0.410     0.449     37612
weighted avg      0.817     0.722     0.755     37612

['Но синовете им не умъртви, защото постъпи според писаното в закона в книгата 

### Try out another learning algorithm

In [30]:
%%time
# Fit the CRF with the 'l2sgd' trainer (SGD with L2 regularisation, per the
# sklearn-crfsuite docs) on the same training data; only c2 is passed for
# regularisation. The recorded run below took ~6 min.
crf = sklearn_crfsuite.CRF(
    algorithm='l2sgd',
    c2=0.1,  # L2 regularisation coefficient
    max_iterations=200,  # upper bound on training iterations
    all_possible_transitions=True  # also model label transitions absent from the training data
)
crf.fit(X_train, y_train)

CPU times: user 5min 49s, sys: 220 ms, total: 5min 49s
Wall time: 5min 48s


In [31]:
# Evaluate the fitted CRF on the dev set. Weighted F1 and the per-label report are
# printed for three label selections, in this order:
#   1. every label the model knows (crf.classes_),
#   2. the same labels without '' (presumably the "no punctuation" label — confirm),
#   3. a fixed list of single punctuation marks plus '...'.
# The second selection is built with a filter instead of labels.remove(''), so the
# cell no longer aborts with ValueError when '' is not among crf.classes_.
y_pred = crf.predict(X_dev)
label_selections = [
    list(crf.classes_),
    [label for label in crf.classes_ if label != ''],
    [',','(',')','"','[',']',';','.','?','!',':','-', '...'],
]
for labels in label_selections:
    print(metrics.flat_f1_score(y_dev, y_pred, average='weighted', labels=labels))
    print(metrics.flat_classification_report(y_dev, y_pred, labels=labels, digits=3))

# Show the first 100 entries of what punctuate returns for the predicted labels.
punctuated_sentences = punctuate(X_dev, y_pred)
print(punctuated_sentences[:100])

  average, "true nor predicted", 'F-score is', len(true_sum)


0.9164711351052118


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


              precision    recall  f1-score   support

                  0.943     0.992     0.967    227825
           :      0.957     0.358     0.521       923
           !      0.000     0.000     0.000       347
           ,      0.791     0.507     0.618     18592
           ?      1.000     0.023     0.044       443
           ;      0.045     0.002     0.003       622
           .      0.866     0.990     0.924     11401
           -      0.000     0.000     0.000       882
          ,(      0.000     0.000     0.000         5
           "      0.167     0.001     0.001      1596
         ").      0.000     0.000     0.000         0
           (      0.922     0.712     0.803      1656
           )      0.793     0.670     0.727      1001
          ?-      0.000     0.000     0.000         4
          .-      0.000     0.000     0.000         2
          :)      0.000     0.000     0.000         0
          !-      0.000     0.000     0.000        20
          :"      0.000    



              precision    recall  f1-score   support

           :      0.957     0.358     0.521       923
           !      0.000     0.000     0.000       347
           ,      0.791     0.507     0.618     18592
           ?      1.000     0.023     0.044       443
           ;      0.045     0.002     0.003       622
           .      0.866     0.990     0.924     11401
           -      0.000     0.000     0.000       882
          ,(      0.000     0.000     0.000         5
           "      0.167     0.001     0.001      1596
         ").      0.000     0.000     0.000         0
           (      0.922     0.712     0.803      1656
           )      0.793     0.670     0.727      1001
          ?-      0.000     0.000     0.000         4
          .-      0.000     0.000     0.000         2
          :)      0.000     0.000     0.000         0
          !-      0.000     0.000     0.000        20
          :"      0.000     0.000     0.000        67
          ."      0.000    



              precision    recall  f1-score   support

           ,      0.791     0.507     0.618     18592
           (      0.922     0.712     0.803      1656
           )      0.793     0.670     0.727      1001
           "      0.167     0.001     0.001      1596
           [      0.000     0.000     0.000        45
           ]      0.000     0.000     0.000        43
           ;      0.045     0.002     0.003       622
           .      0.866     0.990     0.924     11401
           ?      1.000     0.023     0.044       443
           !      0.000     0.000     0.000       347
           :      0.957     0.358     0.521       923
           -      0.000     0.000     0.000       882
         ...      0.000     0.000     0.000        61

   micro avg      0.834     0.609     0.704     37612
   macro avg      0.426     0.251     0.280     37612
weighted avg      0.758     0.609     0.653     37612

['Но синовете им не умъртви, защото постъпи според писаното в закона в книгата 

In [32]:
%%time
# Fit the CRF with the 'ap' trainer (Averaged Perceptron, per the sklearn-crfsuite
# docs). No iteration cap or regularisation is passed, so library defaults apply.
# The recorded run below took ~16.5 min.
crf = sklearn_crfsuite.CRF(
    algorithm='ap',
    all_possible_transitions=True  # also model label transitions absent from the training data
)
crf.fit(X_train, y_train)

CPU times: user 16min 39s, sys: 593 ms, total: 16min 39s
Wall time: 16min 38s


In [33]:
# Evaluate the fitted CRF on the dev set. Weighted F1 and the per-label report are
# printed for three label selections, in this order:
#   1. every label the model knows (crf.classes_),
#   2. the same labels without '' (presumably the "no punctuation" label — confirm),
#   3. a fixed list of single punctuation marks plus '...'.
# The second selection is built with a filter instead of labels.remove(''), so the
# cell no longer aborts with ValueError when '' is not among crf.classes_.
y_pred = crf.predict(X_dev)
label_selections = [
    list(crf.classes_),
    [label for label in crf.classes_ if label != ''],
    [',','(',')','"','[',']',';','.','?','!',':','-', '...'],
]
for labels in label_selections:
    print(metrics.flat_f1_score(y_dev, y_pred, average='weighted', labels=labels))
    print(metrics.flat_classification_report(y_dev, y_pred, labels=labels, digits=3))

# Show the first 100 entries of what punctuate returns for the predicted labels.
punctuated_sentences = punctuate(X_dev, y_pred)
print(punctuated_sentences[:100])

  average, "true nor predicted", 'F-score is', len(true_sum)


0.9156361748152937


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


              precision    recall  f1-score   support

                  0.935     0.997     0.965    227825
           :      0.925     0.403     0.562       923
           !      0.395     0.086     0.142       347
           ,      0.896     0.444     0.594     18592
           ?      0.869     0.526     0.655       443
           ;      0.000     0.000     0.000       622
           .      0.905     0.988     0.945     11401
           -      0.000     0.000     0.000       882
          ,(      0.000     0.000     0.000         5
           "      0.250     0.001     0.001      1596
         ").      0.000     0.000     0.000         0
           (      0.969     0.643     0.773      1656
           )      0.908     0.486     0.633      1001
          ?-      0.000     0.000     0.000         4
          .-      0.000     0.000     0.000         2
          :)      0.000     0.000     0.000         0
          !-      0.000     0.000     0.000        20
          :"      0.000    



              precision    recall  f1-score   support

           :      0.925     0.403     0.562       923
           !      0.395     0.086     0.142       347
           ,      0.896     0.444     0.594     18592
           ?      0.869     0.526     0.655       443
           ;      0.000     0.000     0.000       622
           .      0.905     0.988     0.945     11401
           -      0.000     0.000     0.000       882
          ,(      0.000     0.000     0.000         5
           "      0.250     0.001     0.001      1596
         ").      0.000     0.000     0.000         0
           (      0.969     0.643     0.773      1656
           )      0.908     0.486     0.633      1001
          ?-      0.000     0.000     0.000         4
          .-      0.000     0.000     0.000         2
          :)      0.000     0.000     0.000         0
          !-      0.000     0.000     0.000        20
          :"      0.000     0.000     0.000        67
          ."      0.000    



              precision    recall  f1-score   support

           ,      0.896     0.444     0.594     18592
           (      0.969     0.643     0.773      1656
           )      0.908     0.486     0.633      1001
           "      0.250     0.001     0.001      1596
           [      0.000     0.000     0.000        45
           ]      0.000     0.000     0.000        43
           ;      0.000     0.000     0.000       622
           .      0.905     0.988     0.945     11401
           ?      0.869     0.526     0.655       443
           !      0.395     0.086     0.142       347
           :      0.925     0.403     0.562       923
           -      0.000     0.000     0.000       882
         ...      0.000     0.000     0.000        61

   micro avg      0.903     0.577     0.704     37612
   macro avg      0.471     0.275     0.331     37612
weighted avg      0.831     0.577     0.654     37612

['Но синовете им не умъртви защото постъпи според писаното в закона в книгата н

In [34]:
%%time
# Fit the CRF with the 'pa' trainer (Passive Aggressive, per the sklearn-crfsuite
# docs). No iteration cap is passed, so the library default applies.
# The recorded run below took ~17 min.
crf = sklearn_crfsuite.CRF(
    algorithm='pa',
    all_possible_transitions=True  # also model label transitions absent from the training data
)
crf.fit(X_train, y_train)

CPU times: user 17min 11s, sys: 681 ms, total: 17min 12s
Wall time: 17min 10s


In [35]:
# Evaluate the fitted CRF on the dev set. Weighted F1 and the per-label report are
# printed for three label selections, in this order:
#   1. every label the model knows (crf.classes_),
#   2. the same labels without '' (presumably the "no punctuation" label — confirm),
#   3. a fixed list of single punctuation marks plus '...'.
# The second selection is built with a filter instead of labels.remove(''), so the
# cell no longer aborts with ValueError when '' is not among crf.classes_.
y_pred = crf.predict(X_dev)
label_selections = [
    list(crf.classes_),
    [label for label in crf.classes_ if label != ''],
    [',','(',')','"','[',']',';','.','?','!',':','-', '...'],
]
for labels in label_selections:
    print(metrics.flat_f1_score(y_dev, y_pred, average='weighted', labels=labels))
    print(metrics.flat_classification_report(y_dev, y_pred, labels=labels, digits=3))

# Show the first 100 entries of what punctuate returns for the predicted labels.
punctuated_sentences = punctuate(X_dev, y_pred)
print(punctuated_sentences[:100])

  average, "true nor predicted", 'F-score is', len(true_sum)


0.9342884963579355


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


              precision    recall  f1-score   support

                  0.964     0.979     0.971    227825
           :      0.859     0.680     0.759       923
           !      0.517     0.173     0.259       347
           ,      0.691     0.697     0.694     18592
           ?      0.861     0.700     0.772       443
           ;      0.308     0.013     0.025       622
           .      0.928     0.989     0.958     11401
           -      0.579     0.075     0.133       882
          ,(      0.000     0.000     0.000         5
           "      0.588     0.094     0.162      1596
         ").      0.000     0.000     0.000         0
           (      0.927     0.866     0.895      1656
           )      0.823     0.837     0.830      1001
          ?-      0.000     0.000     0.000         4
          .-      0.000     0.000     0.000         2
          :)      0.000     0.000     0.000         0
          !-      0.000     0.000     0.000        20
          :"      0.417    



              precision    recall  f1-score   support

           :      0.859     0.680     0.759       923
           !      0.517     0.173     0.259       347
           ,      0.691     0.697     0.694     18592
           ?      0.861     0.700     0.772       443
           ;      0.308     0.013     0.025       622
           .      0.928     0.989     0.958     11401
           -      0.579     0.075     0.133       882
          ,(      0.000     0.000     0.000         5
           "      0.588     0.094     0.162      1596
         ").      0.000     0.000     0.000         0
           (      0.927     0.866     0.895      1656
           )      0.823     0.837     0.830      1001
          ?-      0.000     0.000     0.000         4
          .-      0.000     0.000     0.000         2
          :)      0.000     0.000     0.000         0
          !-      0.000     0.000     0.000        20
          :"      0.417     0.075     0.127        67
          ."      0.000    



              precision    recall  f1-score   support

           ,      0.691     0.697     0.694     18592
           (      0.927     0.866     0.895      1656
           )      0.823     0.837     0.830      1001
           "      0.588     0.094     0.162      1596
           [      0.000     0.000     0.000        45
           ]      0.000     0.000     0.000        43
           ;      0.308     0.013     0.025       622
           .      0.928     0.989     0.958     11401
           ?      0.861     0.700     0.772       443
           !      0.517     0.173     0.259       347
           :      0.859     0.680     0.759       923
           -      0.579     0.075     0.133       882
         ...      0.000     0.000     0.000        61

   micro avg      0.791     0.737     0.763     37612
   macro avg      0.545     0.394     0.422     37612
weighted avg      0.765     0.737     0.735     37612

['Но синовете им не умъртви, защото постъпи според писаното в закона в книгата 

In [36]:
%%time
# Fit the CRF with the 'arow' trainer (Adaptive Regularization of Weights, per the
# sklearn-crfsuite docs). No iteration cap is passed, so the library default
# applies. The recorded run below took ~16.5 min.
crf = sklearn_crfsuite.CRF(
    algorithm='arow',
    all_possible_transitions=True  # also model label transitions absent from the training data
)
crf.fit(X_train, y_train)

CPU times: user 16min 27s, sys: 777 ms, total: 16min 28s
Wall time: 16min 26s


In [37]:
# Evaluate the fitted CRF on the dev set. Weighted F1 and the per-label report are
# printed for three label selections, in this order:
#   1. every label the model knows (crf.classes_),
#   2. the same labels without '' (presumably the "no punctuation" label — confirm),
#   3. a fixed list of single punctuation marks plus '...'.
# The second selection is built with a filter instead of labels.remove(''), so the
# cell no longer aborts with ValueError when '' is not among crf.classes_.
y_pred = crf.predict(X_dev)
label_selections = [
    list(crf.classes_),
    [label for label in crf.classes_ if label != ''],
    [',','(',')','"','[',']',';','.','?','!',':','-', '...'],
]
for labels in label_selections:
    print(metrics.flat_f1_score(y_dev, y_pred, average='weighted', labels=labels))
    print(metrics.flat_classification_report(y_dev, y_pred, labels=labels, digits=3))

# Show the first 100 entries of what punctuate returns for the predicted labels.
punctuated_sentences = punctuate(X_dev, y_pred)
print(punctuated_sentences[:100])

  average, "true nor predicted", 'F-score is', len(true_sum)


0.9071222729325642


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


              precision    recall  f1-score   support

                  0.958     0.949     0.954    227825
           :      0.601     0.618     0.609       923
           !      0.216     0.213     0.215       347
           ,      0.560     0.618     0.588     18592
           ?      0.530     0.445     0.483       443
           ;      0.083     0.096     0.089       622
           .      0.909     0.902     0.906     11401
           -      0.111     0.121     0.116       882
          ,(      0.000     0.000     0.000         5
           "      0.155     0.130     0.141      1596
         ").      0.000     0.000     0.000         0
           (      0.917     0.790     0.849      1656
           )      0.816     0.650     0.724      1001
          ?-      0.000     0.000     0.000         4
          .-      0.000     0.000     0.000         2
          :)      0.000     0.000     0.000         0
          !-      0.067     0.050     0.057        20
          :"      0.143    



              precision    recall  f1-score   support

           :      0.601     0.618     0.609       923
           !      0.216     0.213     0.215       347
           ,      0.560     0.618     0.588     18592
           ?      0.530     0.445     0.483       443
           ;      0.083     0.096     0.089       622
           .      0.909     0.902     0.906     11401
           -      0.111     0.121     0.116       882
          ,(      0.000     0.000     0.000         5
           "      0.155     0.130     0.141      1596
         ").      0.000     0.000     0.000         0
           (      0.917     0.790     0.849      1656
           )      0.816     0.650     0.724      1001
          ?-      0.000     0.000     0.000         4
          .-      0.000     0.000     0.000         2
          :)      0.000     0.000     0.000         0
          !-      0.067     0.050     0.057        20
          :"      0.143     0.060     0.084        67
          ."      0.000    



              precision    recall  f1-score   support

           ,      0.560     0.618     0.588     18592
           (      0.917     0.790     0.849      1656
           )      0.816     0.650     0.724      1001
           "      0.155     0.130     0.141      1596
           [      0.000     0.000     0.000        45
           ]      0.000     0.000     0.000        43
           ;      0.083     0.096     0.089       622
           .      0.909     0.902     0.906     11401
           ?      0.530     0.445     0.483       443
           !      0.216     0.213     0.215       347
           :      0.601     0.618     0.609       923
           -      0.111     0.121     0.116       882
         ...      0.000     0.000     0.000        61

   micro avg      0.642     0.663     0.653     37612
   macro avg      0.377     0.353     0.363     37612
weighted avg      0.648     0.663     0.655     37612

['Но синовете им не умъртви, защото постъпи според писаното в закона в книгата 

### Experiment further by adjusting algorithm parameters

In [23]:
# Load the training features from JSON rather than calling data_prep again.
# NOTE(review): presumably this file was saved from an earlier data_prep run — confirm.
X_train = load_json('../../data/X_train.json')

In [24]:
# Load the training labels from JSON rather than calling data_prep again.
# NOTE(review): presumably this file was saved from an earlier data_prep run — confirm.
y_train = load_json('../../data/y_train.json')

In [25]:
# Load the dev-set features from JSON rather than calling data_prep again.
# NOTE(review): presumably this file was saved from an earlier data_prep run — confirm.
X_dev = load_json('../../data/X_dev.json')

In [26]:
# Load the dev-set labels from JSON rather than calling data_prep again.
# NOTE(review): presumably this file was saved from an earlier data_prep run — confirm.
y_dev = load_json('../../data/y_dev.json')

In [27]:
# Sanity-check the training data that was just loaded from JSON.
# NOTE(review): verify_prepped_data is defined outside this section; presumably it
# checks that features and labels line up — confirm against its definition.
verify_prepped_data(X_train, y_train)

In [28]:
# Sanity-check the dev data that was just loaded from JSON.
# NOTE(review): verify_prepped_data is defined outside this section; presumably it
# checks that features and labels line up — confirm against its definition.
verify_prepped_data(X_dev, y_dev)

In [30]:
%%time
# Fit the 'pa' (Passive Aggressive) CRF again, now with an explicit cap of 200
# iterations. The recorded run below took ~34 min.
crf = sklearn_crfsuite.CRF(
    algorithm='pa',
    max_iterations=200,  # upper bound on training iterations
    all_possible_transitions=True  # also model label transitions absent from the training data
)
crf.fit(X_train, y_train)

CPU times: user 34min 23s, sys: 1.61 s, total: 34min 25s
Wall time: 34min 22s


In [31]:
# Evaluate the fitted CRF on the dev set. Weighted F1 and the per-label report are
# printed for three label selections, in this order:
#   1. every label the model knows (crf.classes_),
#   2. the same labels without '' (presumably the "no punctuation" label — confirm),
#   3. a fixed list of single punctuation marks plus '...'.
# The second selection is built with a filter instead of labels.remove(''), so the
# cell no longer aborts with ValueError when '' is not among crf.classes_.
y_pred = crf.predict(X_dev)
label_selections = [
    list(crf.classes_),
    [label for label in crf.classes_ if label != ''],
    [',','(',')','"','[',']',';','.','?','!',':','-', '...'],
]
for labels in label_selections:
    print(metrics.flat_f1_score(y_dev, y_pred, average='weighted', labels=labels))
    print(metrics.flat_classification_report(y_dev, y_pred, labels=labels, digits=3))

# Show the first 100 entries of what punctuate returns for the predicted labels.
punctuated_sentences = punctuate(X_dev, y_pred)
print(punctuated_sentences[:100])

  average, "true nor predicted", 'F-score is', len(true_sum)


0.9356300161705335


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


              precision    recall  f1-score   support

                  0.964     0.979     0.972    227825
           :      0.859     0.705     0.775       923
           !      0.520     0.190     0.278       347
           ,      0.695     0.699     0.697     18592
           ?      0.845     0.711     0.772       443
           ;      0.278     0.024     0.044       622
           .      0.932     0.988     0.959     11401
           -      0.562     0.087     0.151       882
          ,(      0.000     0.000     0.000         5
           "      0.616     0.125     0.207      1596
         ").      0.000     0.000     0.000         0
           (      0.935     0.876     0.905      1656
           )      0.840     0.842     0.841      1001
          ?-      0.000     0.000     0.000         4
          .-      0.000     0.000     0.000         2
          :)      0.000     0.000     0.000         0
          !-      0.000     0.000     0.000        20
          :"      0.500    



              precision    recall  f1-score   support

           :      0.859     0.705     0.775       923
           !      0.520     0.190     0.278       347
           ,      0.695     0.699     0.697     18592
           ?      0.845     0.711     0.772       443
           ;      0.278     0.024     0.044       622
           .      0.932     0.988     0.959     11401
           -      0.562     0.087     0.151       882
          ,(      0.000     0.000     0.000         5
           "      0.616     0.125     0.207      1596
         ").      0.000     0.000     0.000         0
           (      0.935     0.876     0.905      1656
           )      0.840     0.842     0.841      1001
          ?-      0.000     0.000     0.000         4
          .-      0.000     0.000     0.000         2
          :)      0.000     0.000     0.000         0
          !-      0.000     0.000     0.000        20
          :"      0.500     0.164     0.247        67
          ."      0.000    



              precision    recall  f1-score   support

           ,      0.695     0.699     0.697     18592
           (      0.935     0.876     0.905      1656
           )      0.840     0.842     0.841      1001
           "      0.616     0.125     0.207      1596
           [      0.000     0.000     0.000        45
           ]      0.000     0.000     0.000        43
           ;      0.278     0.024     0.044       622
           .      0.932     0.988     0.959     11401
           ?      0.845     0.711     0.772       443
           !      0.520     0.190     0.278       347
           :      0.859     0.705     0.775       923
           -      0.562     0.087     0.151       882
         ...      0.000     0.000     0.000        61

   micro avg      0.794     0.741     0.767     37612
   macro avg      0.545     0.404     0.433     37612
weighted avg      0.769     0.741     0.741     37612

['Но синовете им не умъртви, защото постъпи според писаното в закона в книгата 

In [32]:
%%time
# Fit the 'pa' (Passive Aggressive) CRF once more with the iteration cap raised
# to 300. The recorded run below took ~52 min.
crf = sklearn_crfsuite.CRF(
    algorithm='pa',
    max_iterations=300,  # upper bound on training iterations
    all_possible_transitions=True  # also model label transitions absent from the training data
)
crf.fit(X_train, y_train)

CPU times: user 52min 13s, sys: 2.14 s, total: 52min 15s
Wall time: 52min 11s


In [33]:
# Evaluate the fitted CRF on the dev set. Weighted F1 and the per-label report are
# printed for three label selections, in this order:
#   1. every label the model knows (crf.classes_),
#   2. the same labels without '' (presumably the "no punctuation" label — confirm),
#   3. a fixed list of single punctuation marks plus '...'.
# The second selection is built with a filter instead of labels.remove(''), so the
# cell no longer aborts with ValueError when '' is not among crf.classes_.
y_pred = crf.predict(X_dev)
label_selections = [
    list(crf.classes_),
    [label for label in crf.classes_ if label != ''],
    [',','(',')','"','[',']',';','.','?','!',':','-', '...'],
]
for labels in label_selections:
    print(metrics.flat_f1_score(y_dev, y_pred, average='weighted', labels=labels))
    print(metrics.flat_classification_report(y_dev, y_pred, labels=labels, digits=3))

# Show the first 100 entries of what punctuate returns for the predicted labels.
punctuated_sentences = punctuate(X_dev, y_pred)
print(punctuated_sentences[:100])

  average, "true nor predicted", 'F-score is', len(true_sum)


0.9359619480955952


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


              precision    recall  f1-score   support

                  0.964     0.979     0.972    227825
           :      0.858     0.718     0.782       923
           !      0.514     0.207     0.296       347
           ,      0.697     0.696     0.696     18592
           ?      0.845     0.716     0.775       443
           ;      0.222     0.029     0.051       622
           .      0.933     0.987     0.960     11401
           -      0.533     0.092     0.157       882
          ,(      0.000     0.000     0.000         5
           "      0.625     0.137     0.224      1596
         ").      0.000     0.000     0.000         0
           (      0.939     0.880     0.909      1656
           )      0.849     0.850     0.850      1001
          ?-      0.000     0.000     0.000         4
          .-      0.000     0.000     0.000         2
          :)      0.000     0.000     0.000         0
          !-      0.000     0.000     0.000        20
          :"      0.545    



              precision    recall  f1-score   support

           :      0.858     0.718     0.782       923
           !      0.514     0.207     0.296       347
           ,      0.697     0.696     0.696     18592
           ?      0.845     0.716     0.775       443
           ;      0.222     0.029     0.051       622
           .      0.933     0.987     0.960     11401
           -      0.533     0.092     0.157       882
          ,(      0.000     0.000     0.000         5
           "      0.625     0.137     0.224      1596
         ").      0.000     0.000     0.000         0
           (      0.939     0.880     0.909      1656
           )      0.849     0.850     0.850      1001
          ?-      0.000     0.000     0.000         4
          .-      0.000     0.000     0.000         2
          :)      0.000     0.000     0.000         0
          !-      0.000     0.000     0.000        20
          :"      0.545     0.179     0.270        67
          ."      1.000    



              precision    recall  f1-score   support

           ,      0.697     0.696     0.696     18592
           (      0.939     0.880     0.909      1656
           )      0.849     0.850     0.850      1001
           "      0.625     0.137     0.224      1596
           [      0.000     0.000     0.000        45
           ]      0.000     0.000     0.000        43
           ;      0.222     0.029     0.051       622
           .      0.933     0.987     0.960     11401
           ?      0.845     0.716     0.775       443
           !      0.514     0.207     0.296       347
           :      0.858     0.718     0.782       923
           -      0.533     0.092     0.157       882
         ...      0.000     0.000     0.000        61

   micro avg      0.795     0.741     0.767     37612
   macro avg      0.540     0.409     0.438     37612
weighted avg      0.770     0.741     0.743     37612

['Но синовете им не умъртви, защото постъпи според писаното в закона в книгата 

In [34]:
%%time
crf = sklearn_crfsuite.CRF(
    algorithm='arow',
    max_iterations=200,
    all_possible_transitions=True
)
crf.fit(X_train, y_train)

CPU times: user 33min 7s, sys: 969 ms, total: 33min 8s
Wall time: 33min 5s


In [35]:
# Evaluate the current `crf` on the dev set with three label selections:
#   1. every class the model knows,
#   2. every class except the empty-string label (the blank row in the report),
#   3. a fixed list of basic punctuation marks.
all_labels = list(crf.classes_)
# Filter instead of list.remove(''): this neither mutates the list in place nor
# raises ValueError when the model has no empty-string class.
non_empty_labels = [label for label in all_labels if label != '']
# `labels` is left bound to the fixed punctuation list, as later cells reuse it.
labels = [',','(',')','"','[',']',';','.','?','!',':','-', '...']

y_pred = crf.predict(X_dev)
for label_selection in (all_labels, non_empty_labels, labels):
    print(metrics.flat_f1_score(y_dev, y_pred, average='weighted', labels=label_selection))
    print(metrics.flat_classification_report(y_dev, y_pred, labels=label_selection, digits=3))

# punctuate() comes from the utilities notebook; presumably it rebuilds the text
# with the predicted marks inserted -- TODO confirm.
punctuated_sentences = punctuate(X_dev, y_pred)
print(punctuated_sentences[:100])

  average, "true nor predicted", 'F-score is', len(true_sum)


0.9047636440834411


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


              precision    recall  f1-score   support

                  0.958     0.944     0.951    227825
           :      0.602     0.615     0.609       923
           !      0.227     0.207     0.217       347
           ,      0.560     0.608     0.583     18592
           ?      0.597     0.381     0.466       443
           ;      0.072     0.085     0.078       622
           .      0.918     0.892     0.905     11401
           -      0.078     0.095     0.086       882
          ,(      0.000     0.000     0.000         5
           "      0.169     0.146     0.157      1596
         ").      0.000     0.000     0.000         0
           (      0.904     0.807     0.853      1656
           )      0.824     0.656     0.731      1001
          ?-      0.000     0.000     0.000         4
          .-      0.000     0.000     0.000         2
          :)      0.000     0.000     0.000         0
          !-      0.016     0.050     0.024        20
          :"      0.171    



              precision    recall  f1-score   support

           :      0.602     0.615     0.609       923
           !      0.227     0.207     0.217       347
           ,      0.560     0.608     0.583     18592
           ?      0.597     0.381     0.466       443
           ;      0.072     0.085     0.078       622
           .      0.918     0.892     0.905     11401
           -      0.078     0.095     0.086       882
          ,(      0.000     0.000     0.000         5
           "      0.169     0.146     0.157      1596
         ").      0.000     0.000     0.000         0
           (      0.904     0.807     0.853      1656
           )      0.824     0.656     0.731      1001
          ?-      0.000     0.000     0.000         4
          .-      0.000     0.000     0.000         2
          :)      0.000     0.000     0.000         0
          !-      0.016     0.050     0.024        20
          :"      0.171     0.090     0.118        67
          ."      0.000    



              precision    recall  f1-score   support

           ,      0.560     0.608     0.583     18592
           (      0.904     0.807     0.853      1656
           )      0.824     0.656     0.731      1001
           "      0.169     0.146     0.157      1596
           [      0.000     0.000     0.000        45
           ]      0.000     0.000     0.000        43
           ;      0.072     0.085     0.078       622
           .      0.918     0.892     0.905     11401
           ?      0.597     0.381     0.466       443
           !      0.227     0.207     0.217       347
           :      0.602     0.615     0.609       923
           -      0.078     0.095     0.086       882
         ...      0.000     0.000     0.000        61

   micro avg      0.640     0.656     0.648     37612
   macro avg      0.381     0.346     0.360     37612
weighted avg      0.651     0.656     0.652     37612

['Но синовете им не умъртви, защото постъпи според писаното в закона в книгата 

In [36]:
%%time
crf = sklearn_crfsuite.CRF(
    algorithm='arow',
    max_iterations=300,
    all_possible_transitions=True
)
crf.fit(X_train, y_train)

CPU times: user 48min 11s, sys: 1.13 s, total: 48min 12s
Wall time: 48min 8s


In [37]:
# Evaluate the current `crf` on the dev set with three label selections:
#   1. every class the model knows,
#   2. every class except the empty-string label (the blank row in the report),
#   3. a fixed list of basic punctuation marks.
all_labels = list(crf.classes_)
# Filter instead of list.remove(''): this neither mutates the list in place nor
# raises ValueError when the model has no empty-string class.
non_empty_labels = [label for label in all_labels if label != '']
# `labels` is left bound to the fixed punctuation list, as later cells reuse it.
labels = [',','(',')','"','[',']',';','.','?','!',':','-', '...']

y_pred = crf.predict(X_dev)
for label_selection in (all_labels, non_empty_labels, labels):
    print(metrics.flat_f1_score(y_dev, y_pred, average='weighted', labels=label_selection))
    print(metrics.flat_classification_report(y_dev, y_pred, labels=label_selection, digits=3))

# punctuate() comes from the utilities notebook; presumably it rebuilds the text
# with the predicted marks inserted -- TODO confirm.
punctuated_sentences = punctuate(X_dev, y_pred)
print(punctuated_sentences[:100])

  average, "true nor predicted", 'F-score is', len(true_sum)


0.9058672396568239


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


              precision    recall  f1-score   support

                  0.957     0.946     0.952    227825
           :      0.592     0.601     0.597       923
           !      0.273     0.219     0.243       347
           ,      0.564     0.611     0.586     18592
           ?      0.587     0.388     0.467       443
           ;      0.082     0.092     0.087       622
           .      0.920     0.898     0.909     11401
           -      0.109     0.108     0.108       882
          ,(      0.000     0.000     0.000         5
           "      0.184     0.148     0.164      1596
         ").      0.000     0.000     0.000         0
           (      0.919     0.806     0.859      1656
           )      0.810     0.671     0.734      1001
          ?-      0.000     0.000     0.000         4
          .-      0.000     0.000     0.000         2
          :)      0.000     0.000     0.000         0
          !-      0.062     0.050     0.056        20
          :"      0.132    



              precision    recall  f1-score   support

           :      0.592     0.601     0.597       923
           !      0.273     0.219     0.243       347
           ,      0.564     0.611     0.586     18592
           ?      0.587     0.388     0.467       443
           ;      0.082     0.092     0.087       622
           .      0.920     0.898     0.909     11401
           -      0.109     0.108     0.108       882
          ,(      0.000     0.000     0.000         5
           "      0.184     0.148     0.164      1596
         ").      0.000     0.000     0.000         0
           (      0.919     0.806     0.859      1656
           )      0.810     0.671     0.734      1001
          ?-      0.000     0.000     0.000         4
          .-      0.000     0.000     0.000         2
          :)      0.000     0.000     0.000         0
          !-      0.062     0.050     0.056        20
          :"      0.132     0.075     0.095        67
          ."      0.000    



              precision    recall  f1-score   support

           ,      0.564     0.611     0.586     18592
           (      0.919     0.806     0.859      1656
           )      0.810     0.671     0.734      1001
           "      0.184     0.148     0.164      1596
           [      0.000     0.000     0.000        45
           ]      0.000     0.000     0.000        43
           ;      0.082     0.092     0.087       622
           .      0.920     0.898     0.909     11401
           ?      0.587     0.388     0.467       443
           !      0.273     0.219     0.243       347
           :      0.592     0.601     0.597       923
           -      0.109     0.108     0.108       882
         ...      0.000     0.000     0.000        61

   micro avg      0.652     0.659     0.655     37612
   macro avg      0.388     0.349     0.366     37612
weighted avg      0.655     0.659     0.656     37612

['Но синовете им не умъртви, защото постъпи според писаното в закона в книгата 

In [38]:
%%time
crf = sklearn_crfsuite.CRF(
    algorithm='lbfgs',
    c1=0.1,
    c2=0.1,
    all_possible_transitions=True
)
crf.fit(X_train, y_train)

CPU times: user 6h 45min 59s, sys: 11.2 s, total: 6h 46min 10s
Wall time: 6h 45min 34s


In [39]:
# Evaluate the current `crf` on the dev set with three label selections:
#   1. every class the model knows,
#   2. every class except the empty-string label (the blank row in the report),
#   3. a fixed list of basic punctuation marks.
all_labels = list(crf.classes_)
# Filter instead of list.remove(''): this neither mutates the list in place nor
# raises ValueError when the model has no empty-string class.
non_empty_labels = [label for label in all_labels if label != '']
# `labels` is left bound to the fixed punctuation list, as later cells reuse it.
labels = [',','(',')','"','[',']',';','.','?','!',':','-', '...']

y_pred = crf.predict(X_dev)
for label_selection in (all_labels, non_empty_labels, labels):
    print(metrics.flat_f1_score(y_dev, y_pred, average='weighted', labels=label_selection))
    print(metrics.flat_classification_report(y_dev, y_pred, labels=label_selection, digits=3))

# punctuate() comes from the utilities notebook; presumably it rebuilds the text
# with the predicted marks inserted -- TODO confirm.
punctuated_sentences = punctuate(X_dev, y_pred)
print(punctuated_sentences[:100])

  average, "true nor predicted", 'F-score is', len(true_sum)


0.9400507737000229


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


              precision    recall  f1-score   support

                  0.961     0.988     0.974    227825
           :      0.885     0.736     0.804       923
           !      0.490     0.219     0.303       347
           ,      0.786     0.653     0.714     18592
           ?      0.834     0.702     0.762       443
           ;      0.235     0.061     0.097       622
           .      0.936     0.987     0.961     11401
           -      0.536     0.111     0.184       882
          ,(      0.000     0.000     0.000         5
           "      0.646     0.148     0.241      1596
         ").      0.000     0.000     0.000         0
           (      0.959     0.881     0.918      1656
           )      0.880     0.834     0.856      1001
          ?-      0.000     0.000     0.000         4
          .-      0.000     0.000     0.000         2
          :)      0.000     0.000     0.000         0
          !-      0.667     0.100     0.174        20
          :"      0.667    



              precision    recall  f1-score   support

           :      0.885     0.736     0.804       923
           !      0.490     0.219     0.303       347
           ,      0.786     0.653     0.714     18592
           ?      0.834     0.702     0.762       443
           ;      0.235     0.061     0.097       622
           .      0.936     0.987     0.961     11401
           -      0.536     0.111     0.184       882
          ,(      0.000     0.000     0.000         5
           "      0.646     0.148     0.241      1596
         ").      0.000     0.000     0.000         0
           (      0.959     0.881     0.918      1656
           )      0.880     0.834     0.856      1001
          ?-      0.000     0.000     0.000         4
          .-      0.000     0.000     0.000         2
          :)      0.000     0.000     0.000         0
          !-      0.667     0.100     0.174        20
          :"      0.667     0.269     0.383        67
          ."      0.500    



              precision    recall  f1-score   support

           ,      0.786     0.653     0.714     18592
           (      0.959     0.881     0.918      1656
           )      0.880     0.834     0.856      1001
           "      0.646     0.148     0.241      1596
           [      0.000     0.000     0.000        45
           ]      0.000     0.000     0.000        43
           ;      0.235     0.061     0.097       622
           .      0.936     0.987     0.961     11401
           ?      0.834     0.702     0.762       443
           !      0.490     0.219     0.303       347
           :      0.885     0.736     0.804       923
           -      0.536     0.111     0.184       882
         ...      0.000     0.000     0.000        61

   micro avg      0.849     0.721     0.780     37612
   macro avg      0.553     0.410     0.449     37612
weighted avg      0.818     0.721     0.755     37612

['Но синовете им не умъртви, защото постъпи според писаното в закона в книгата 

### Do Randomized Search on a subset (first 2000 / 5000 items) of the train set

In [32]:
from sklearn.metrics import make_scorer
import scipy.stats
from sklearn.model_selection import RandomizedSearchCV

In [33]:
# Punctuation labels used for scoring the search and for the dev-set reports.
labels = [
    ',', '(', ')', '"', '[', ']',
    ';', '.', '?', '!', ':', '-',
    '...',
]

In [34]:
%%time
# define fixed parameters and parameters to search
crf = sklearn_crfsuite.CRF(
    algorithm='lbfgs',
    max_iterations=100,
    all_possible_transitions=True,
    linesearch='MoreThuente'
    
)
params_space = {
    'c1': scipy.stats.expon(scale=0.5),
    'c2': scipy.stats.expon(scale=0.05),
#     'all_possible_transitions': [True, False],
#     'linesearch': ['MoreThuente', 'Backtracking', 'StrongBacktracking'],
    'num_memories': [4, 6, 8],
    'period': [8, 10, 12],
    'epsilon': scipy.stats.loguniform(1e-6, 1e-4),
    'delta': scipy.stats.loguniform(1e-6, 1e-4)
}

CPU times: user 4.41 ms, sys: 51 µs, total: 4.46 ms
Wall time: 4.33 ms


In [35]:
%%time
# use the same metric for evaluation
f1_scorer = make_scorer(metrics.flat_f1_score,
                        average='weighted', labels=labels)

# search
rs = RandomizedSearchCV(crf, params_space,
                        cv=3,
                        verbose=1,
                        n_jobs=-1,
                        n_iter=50,
                        scoring=f1_scorer)
rs.fit(X_train[:2000], y_train[:2000])

Fitting 3 folds for each of 50 candidates, totalling 150 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:  3.1min
[Parallel(n_jobs=-1)]: Done 150 out of 150 | elapsed: 12.8min finished


CPU times: user 12min 48s, sys: 5.63 s, total: 12min 53s
Wall time: 13min 7s


In [36]:
# Report the winning configuration of the randomised search, then evaluate the
# best estimator on the dev set. (The original cell assigned
# `crf = rs.best_estimator_` twice; one assignment is enough.)
crf = rs.best_estimator_
print('best params:', rs.best_params_)
print('best CV score:', rs.best_score_)

y_pred = crf.predict(X_dev)

print(metrics.flat_classification_report(y_dev, y_pred, labels=labels, digits=3))
# punctuate() comes from the utilities notebook; presumably it rebuilds the text
# with the predicted marks inserted -- TODO confirm.
punctuated_sentences = punctuate(X_dev, y_pred)
print(punctuated_sentences[:100])

best params: {'c1': 0.3952681242447892, 'c2': 0.05986417647894812, 'delta': 2.023975781529585e-06, 'epsilon': 1.797571535000269e-06, 'num_memories': 6, 'period': 12}
best CV score: 0.6694467873383076


  _warn_prf(average, modifier, msg_start, len(result))


              precision    recall  f1-score   support

           ,      0.604     0.577     0.590     18592
           (      0.000     0.000     0.000      1656
           )      0.000     0.000     0.000      1001
           "      0.000     0.000     0.000      1596
           [      0.000     0.000     0.000        45
           ]      0.000     0.000     0.000        43
           ;      0.165     0.034     0.056       622
           .      0.895     0.967     0.930     11401
           ?      0.615     0.828     0.706       443
           !      0.347     0.095     0.149       347
           :      0.336     0.684     0.451       923
           -      0.158     0.018     0.033       882
         ...      0.000     0.000     0.000        61

   micro avg      0.694     0.607     0.647     37612
   macro avg      0.240     0.246     0.224     37612
weighted avg      0.595     0.607     0.596     37612

['Но синовете им не умъртви, защото постъпи според писаното в закона в книгата 

In [38]:
%%time
# use the same metric for evaluation
f1_scorer = make_scorer(metrics.flat_f1_score,
                        average='weighted', labels=labels)

# search
rs = RandomizedSearchCV(crf, params_space,
                        cv=3,
                        verbose=1,
                        n_jobs=-1,
                        n_iter=50,
                        scoring=f1_scorer)
rs.fit(X_train[:5000], y_train[:5000])

Fitting 3 folds for each of 50 candidates, totalling 150 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed: 10.1min
[Parallel(n_jobs=-1)]: Done 150 out of 150 | elapsed: 44.3min finished


CPU times: user 44min 20s, sys: 19.9 s, total: 44min 40s
Wall time: 45min 38s


In [39]:
# Report the winning configuration of the randomised search, then evaluate the
# best estimator on the dev set. (The original cell assigned
# `crf = rs.best_estimator_` twice; one assignment is enough.)
crf = rs.best_estimator_
print('best params:', rs.best_params_)
print('best CV score:', rs.best_score_)

y_pred = crf.predict(X_dev)

print(metrics.flat_classification_report(y_dev, y_pred, labels=labels, digits=3))
# punctuate() comes from the utilities notebook; presumably it rebuilds the text
# with the predicted marks inserted -- TODO confirm.
punctuated_sentences = punctuate(X_dev, y_pred)
print(punctuated_sentences[:100])

best params: {'c1': 0.6107718063593071, 'c2': 0.018728827768043445, 'delta': 1.0572740284119278e-05, 'epsilon': 2.1145487936128027e-05, 'num_memories': 8, 'period': 12}
best CV score: 0.698636789224719


  _warn_prf(average, modifier, msg_start, len(result))


              precision    recall  f1-score   support

           ,      0.581     0.614     0.597     18592
           (      0.000     0.000     0.000      1656
           )      0.000     0.000     0.000      1001
           "      0.000     0.000     0.000      1596
           [      0.000     0.000     0.000        45
           ]      0.000     0.000     0.000        43
           ;      0.164     0.047     0.073       622
           .      0.897     0.963     0.929     11401
           ?      0.655     0.833     0.734       443
           !      0.298     0.184     0.228       347
           :      0.479     0.729     0.578       923
           -      0.173     0.026     0.045       882
         ...      0.000     0.000     0.000        61

   micro avg      0.685     0.626     0.655     37612
   macro avg      0.250     0.261     0.245     37612
weighted avg      0.588     0.626     0.604     37612

['Но синовете им не умъртви, защото постъпи според писаното в закона в книгата 

In [46]:
%%time
# define fixed parameters and parameters to search
crf = sklearn_crfsuite.CRF(
    algorithm='lbfgs',
    max_iterations=100,
    all_possible_transitions=True,
    linesearch='Backtracking'
    
)
params_space = {
    'c1': scipy.stats.expon(scale=0.5),
    'c2': scipy.stats.expon(scale=0.05),
#     'all_possible_transitions': [True, False],
#     'linesearch': ['MoreThuente', 'Backtracking', 'StrongBacktracking'],
    'num_memories': [4, 6, 8],
    'period': [8, 10, 12],
    'epsilon': scipy.stats.loguniform(1e-6, 1e-4),
    'delta': scipy.stats.loguniform(1e-6, 1e-4)
}

CPU times: user 2.13 ms, sys: 0 ns, total: 2.13 ms
Wall time: 2.12 ms


In [47]:
%%time
# use the same metric for evaluation
f1_scorer = make_scorer(metrics.flat_f1_score,
                        average='weighted', labels=labels)

# search
rs = RandomizedSearchCV(crf, params_space,
                        cv=3,
                        verbose=1,
                        n_jobs=-1,
                        n_iter=50,
                        scoring=f1_scorer)
rs.fit(X_train[:5000], y_train[:5000])

Fitting 3 folds for each of 50 candidates, totalling 150 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed: 10.1min
[Parallel(n_jobs=-1)]: Done 150 out of 150 | elapsed: 43.7min finished


CPU times: user 44min, sys: 17.4 s, total: 44min 17s
Wall time: 45min 10s


In [48]:
# Report the winning configuration of the randomised search, then evaluate the
# best estimator on the dev set. (The original cell assigned
# `crf = rs.best_estimator_` twice; one assignment is enough.)
crf = rs.best_estimator_
print('best params:', rs.best_params_)
print('best CV score:', rs.best_score_)

y_pred = crf.predict(X_dev)

print(metrics.flat_classification_report(y_dev, y_pred, labels=labels, digits=3))
# punctuate() comes from the utilities notebook; presumably it rebuilds the text
# with the predicted marks inserted -- TODO confirm.
punctuated_sentences = punctuate(X_dev, y_pred)
print(punctuated_sentences[:100])

best params: {'c1': 0.5323290430655947, 'c2': 0.09804313996988119, 'delta': 9.061729324618556e-06, 'epsilon': 3.2230632072995785e-05, 'num_memories': 8, 'period': 10}
best CV score: 0.6995873649879396


  _warn_prf(average, modifier, msg_start, len(result))


              precision    recall  f1-score   support

           ,      0.566     0.627     0.595     18592
           (      0.000     0.000     0.000      1656
           )      0.000     0.000     0.000      1001
           "      0.000     0.000     0.000      1596
           [      0.000     0.000     0.000        45
           ]      0.000     0.000     0.000        43
           ;      0.162     0.040     0.064       622
           .      0.897     0.965     0.930     11401
           ?      0.643     0.849     0.732       443
           !      0.339     0.176     0.231       347
           :      0.487     0.728     0.584       923
           -      0.219     0.028     0.050       882
         ...      0.000     0.000     0.000        61

   micro avg      0.675     0.633     0.654     37612
   macro avg      0.255     0.263     0.245     37612
weighted avg      0.582     0.633     0.603     37612

['Но синовете им не умъртви, защото постъпи според писаното в закона в книгата 

In [49]:
%%time
# define fixed parameters and parameters to search
crf = sklearn_crfsuite.CRF(
    algorithm='lbfgs',
    max_iterations=100,
    all_possible_transitions=True,
    linesearch='StrongBacktracking'
    
)
params_space = {
    'c1': scipy.stats.expon(scale=0.5),
    'c2': scipy.stats.expon(scale=0.05),
#     'all_possible_transitions': [True, False],
#     'linesearch': ['MoreThuente', 'Backtracking', 'StrongBacktracking'],
    'num_memories': [4, 6, 8],
    'period': [8, 10, 12],
    'epsilon': scipy.stats.loguniform(1e-6, 1e-4),
    'delta': scipy.stats.loguniform(1e-6, 1e-4)
}

CPU times: user 3.62 ms, sys: 0 ns, total: 3.62 ms
Wall time: 2.92 ms


In [50]:
%%time
# use the same metric for evaluation
f1_scorer = make_scorer(metrics.flat_f1_score,
                        average='weighted', labels=labels)

# search
rs = RandomizedSearchCV(crf, params_space,
                        cv=3,
                        verbose=1,
                        n_jobs=-1,
                        n_iter=50,
                        scoring=f1_scorer)
rs.fit(X_train[:5000], y_train[:5000])

Fitting 3 folds for each of 50 candidates, totalling 150 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed: 10.1min
[Parallel(n_jobs=-1)]: Done 150 out of 150 | elapsed: 41.7min finished


CPU times: user 42min, sys: 14.9 s, total: 42min 15s
Wall time: 43min 9s


In [51]:
# Report the winning configuration of the randomised search, then evaluate the
# best estimator on the dev set. (The original cell assigned
# `crf = rs.best_estimator_` twice; one assignment is enough.)
crf = rs.best_estimator_
print('best params:', rs.best_params_)
print('best CV score:', rs.best_score_)

y_pred = crf.predict(X_dev)

print(metrics.flat_classification_report(y_dev, y_pred, labels=labels, digits=3))
# punctuate() comes from the utilities notebook; presumably it rebuilds the text
# with the predicted marks inserted -- TODO confirm.
punctuated_sentences = punctuate(X_dev, y_pred)
print(punctuated_sentences[:100])

best params: {'c1': 0.5627689794744889, 'c2': 0.08851643184326857, 'delta': 1.756419310822094e-05, 'epsilon': 2.408864673024824e-06, 'num_memories': 8, 'period': 12}
best CV score: 0.6989675133908441


  _warn_prf(average, modifier, msg_start, len(result))


              precision    recall  f1-score   support

           ,      0.582     0.614     0.598     18592
           (      0.000     0.000     0.000      1656
           )      0.000     0.000     0.000      1001
           "      0.000     0.000     0.000      1596
           [      0.000     0.000     0.000        45
           ]      0.000     0.000     0.000        43
           ;      0.168     0.039     0.063       622
           .      0.898     0.964     0.929     11401
           ?      0.632     0.862     0.730       443
           !      0.326     0.170     0.223       347
           :      0.513     0.718     0.599       923
           -      0.208     0.028     0.050       882
         ...      0.000     0.000     0.000        61

   micro avg      0.689     0.626     0.656     37612
   macro avg      0.256     0.261     0.246     37612
weighted avg      0.591     0.626     0.605     37612

['Но синовете им не умъртви, защото постъпи според писаното в закона в книгата 

In [52]:
%%time
# define fixed parameters and parameters to search
crf = sklearn_crfsuite.CRF(
    algorithm='lbfgs',
    max_iterations=100,
    all_possible_transitions=False,
    linesearch='MoreThuente'
    
)
params_space = {
    'c1': scipy.stats.expon(scale=0.5),
    'c2': scipy.stats.expon(scale=0.05),
#     'all_possible_transitions': [True, False],
#     'linesearch': ['MoreThuente', 'Backtracking', 'StrongBacktracking'],
    'num_memories': [4, 6, 8],
    'period': [8, 10, 12],
    'epsilon': scipy.stats.loguniform(1e-6, 1e-4),
    'delta': scipy.stats.loguniform(1e-6, 1e-4)
}

CPU times: user 4.04 ms, sys: 0 ns, total: 4.04 ms
Wall time: 3.99 ms


In [53]:
%%time
# use the same metric for evaluation
f1_scorer = make_scorer(metrics.flat_f1_score,
                        average='weighted', labels=labels)

# search
rs = RandomizedSearchCV(crf, params_space,
                        cv=3,
                        verbose=1,
                        n_jobs=-1,
                        n_iter=50,
                        scoring=f1_scorer)
rs.fit(X_train[:5000], y_train[:5000])

Fitting 3 folds for each of 50 candidates, totalling 150 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed: 11.2min
[Parallel(n_jobs=-1)]: Done 150 out of 150 | elapsed: 50.8min finished


CPU times: user 50min 46s, sys: 20.2 s, total: 51min 6s
Wall time: 52min 3s


In [54]:
# Report the winning configuration of the randomised search, then evaluate the
# best estimator on the dev set. (The original cell assigned
# `crf = rs.best_estimator_` twice; one assignment is enough.)
crf = rs.best_estimator_
print('best params:', rs.best_params_)
print('best CV score:', rs.best_score_)

y_pred = crf.predict(X_dev)

print(metrics.flat_classification_report(y_dev, y_pred, labels=labels, digits=3))
# punctuate() comes from the utilities notebook; presumably it rebuilds the text
# with the predicted marks inserted -- TODO confirm.
punctuated_sentences = punctuate(X_dev, y_pred)
print(punctuated_sentences[:100])

best params: {'c1': 0.49652630456506613, 'c2': 0.12057673150182804, 'delta': 1.7295250925683704e-05, 'epsilon': 4.6501223090121836e-05, 'num_memories': 8, 'period': 8}
best CV score: 0.6985161971603957


  _warn_prf(average, modifier, msg_start, len(result))


              precision    recall  f1-score   support

           ,      0.564     0.621     0.591     18592
           (      0.000     0.000     0.000      1656
           )      0.000     0.000     0.000      1001
           "      0.000     0.000     0.000      1596
           [      0.000     0.000     0.000        45
           ]      0.000     0.000     0.000        43
           ;      0.144     0.035     0.057       622
           .      0.896     0.965     0.929     11401
           ?      0.651     0.856     0.740       443
           !      0.304     0.141     0.193       347
           :      0.459     0.722     0.561       923
           -      0.216     0.028     0.050       882
         ...      0.000     0.000     0.000        61

   micro avg      0.673     0.630     0.651     37612
   macro avg      0.249     0.259     0.240     37612
weighted avg      0.580     0.630     0.600     37612

['Но синовете им не умъртви, защото постъпи според писаното в закона в книгата 

In [55]:
%%time
crf = sklearn_crfsuite.CRF(
    algorithm='lbfgs',
    c1=0.1,
    c2=0.1,
    max_iterations=100,
    all_possible_transitions=True
)
crf.fit(X_train[:5000], y_train[:5000])

CPU times: user 1min 19s, sys: 32 ms, total: 1min 19s
Wall time: 1min 19s


In [56]:
# Evaluate the current `crf` on the dev set, restricted to `labels`.
y_pred = crf.predict(X_dev)

dev_report = metrics.flat_classification_report(
    y_dev, y_pred, labels=labels, digits=3
)
print(dev_report)

# Show the first 100 entries of the punctuate() result for the predictions.
punctuated_sentences = punctuate(X_dev, y_pred)
print(punctuated_sentences[:100])

  _warn_prf(average, modifier, msg_start, len(result))


              precision    recall  f1-score   support

           ,      0.638     0.576     0.605     18592
           (      0.000     0.000     0.000      1656
           )      0.000     0.000     0.000      1001
           "      0.000     0.000     0.000      1596
           [      0.000     0.000     0.000        45
           ]      0.000     0.000     0.000        43
           ;      0.141     0.045     0.068       622
           .      0.895     0.969     0.931     11401
           ?      0.681     0.810     0.740       443
           !      0.362     0.170     0.231       347
           :      0.514     0.717     0.599       923
           -      0.165     0.026     0.045       882
         ...      0.000     0.000     0.000        61

   micro avg      0.728     0.608     0.663     37612
   macro avg      0.261     0.255     0.248     37612
weighted avg      0.617     0.608     0.609     37612

['Но синовете им не умъртви, защото постъпи според писаното в закона в книгата 

### Try out optimal algorithm with fine-tuned parameters

In [29]:
%%time
crf = sklearn_crfsuite.CRF(
    algorithm='lbfgs',
    c1=0.1,
    c2=0.1,
    all_possible_transitions=True,
    linesearch='MoreThuente',
    num_memories=8,
    period=12,
    epsilon=2e-6,
    delta=1.5e-5  
)
crf.fit(X_train, y_train)

CPU times: user 6h 55min 46s, sys: 16.8 s, total: 6h 56min 3s
Wall time: 6h 55min 24s


In [30]:
# Evaluate the current `crf` on the dev set with three label selections:
#   1. every class the model knows,
#   2. every class except the empty-string label (the blank row in the report),
#   3. a fixed list of basic punctuation marks.
all_labels = list(crf.classes_)
# Filter instead of list.remove(''): this neither mutates the list in place nor
# raises ValueError when the model has no empty-string class.
non_empty_labels = [label for label in all_labels if label != '']
# `labels` is left bound to the fixed punctuation list, as later cells reuse it.
labels = [',','(',')','"','[',']',';','.','?','!',':','-', '...']

y_pred = crf.predict(X_dev)
for label_selection in (all_labels, non_empty_labels, labels):
    print(metrics.flat_f1_score(y_dev, y_pred, average='weighted', labels=label_selection))
    print(metrics.flat_classification_report(y_dev, y_pred, labels=label_selection, digits=3))

# punctuate() comes from the utilities notebook; presumably it rebuilds the text
# with the predicted marks inserted -- TODO confirm.
punctuated_sentences = punctuate(X_dev, y_pred)
print(punctuated_sentences[:100])

  average, "true nor predicted", 'F-score is', len(true_sum)


0.9400177135473814


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


              precision    recall  f1-score   support

                  0.961     0.988     0.974    227825
           :      0.889     0.736     0.805       923
           !      0.497     0.222     0.307       347
           ,      0.787     0.653     0.714     18592
           ?      0.834     0.702     0.762       443
           ;      0.219     0.056     0.090       622
           .      0.936     0.987     0.961     11401
           -      0.527     0.111     0.184       882
          ,(      0.000     0.000     0.000         5
           "      0.639     0.145     0.237      1596
         ").      0.000     0.000     0.000         0
           (      0.957     0.881     0.918      1656
           )      0.878     0.835     0.856      1001
          ?-      0.000     0.000     0.000         4
          .-      0.000     0.000     0.000         2
          :)      0.000     0.000     0.000         0
          !-      0.667     0.100     0.174        20
          :"      0.679    



              precision    recall  f1-score   support

           :      0.889     0.736     0.805       923
           !      0.497     0.222     0.307       347
           ,      0.787     0.653     0.714     18592
           ?      0.834     0.702     0.762       443
           ;      0.219     0.056     0.090       622
           .      0.936     0.987     0.961     11401
           -      0.527     0.111     0.184       882
          ,(      0.000     0.000     0.000         5
           "      0.639     0.145     0.237      1596
         ").      0.000     0.000     0.000         0
           (      0.957     0.881     0.918      1656
           )      0.878     0.835     0.856      1001
          ?-      0.000     0.000     0.000         4
          .-      0.000     0.000     0.000         2
          :)      0.000     0.000     0.000         0
          !-      0.667     0.100     0.174        20
          :"      0.679     0.284     0.400        67
          ."      0.500    



              precision    recall  f1-score   support

           ,      0.787     0.653     0.714     18592
           (      0.957     0.881     0.918      1656
           )      0.878     0.835     0.856      1001
           "      0.639     0.145     0.237      1596
           [      0.000     0.000     0.000        45
           ]      0.000     0.000     0.000        43
           ;      0.219     0.056     0.090       622
           .      0.936     0.987     0.961     11401
           ?      0.834     0.702     0.762       443
           !      0.497     0.222     0.307       347
           :      0.889     0.736     0.805       923
           -      0.527     0.111     0.184       882
         ...      0.000     0.000     0.000        61

   micro avg      0.849     0.721     0.780     37612
   macro avg      0.551     0.410     0.449     37612
weighted avg      0.817     0.721     0.755     37612

['Но синовете им не умъртви, защото постъпи според писаното в закона в книгата 