### Function Definitions

In [1]:
import json
import sklearn_crfsuite
from sklearn_crfsuite import metrics
import classla
import json
import re

In [2]:
# input - name of a text file with one sentence per line
# output - list of sentences (strings)
def read_file_as_list_of_sentences(input_file_name):
    with open(input_file_name, "r") as input_file:
        return input_file.read().splitlines()

In [3]:
# input - list of sentences (strings) and a classla pipeline (POS tokenize or tokenize)
# output - list of dictionaries ((POS) tokenized sentences) 
def run_through_classla_pipeline(list_of_sentences, pipeline):
    return [pipeline(sentence).to_dict()[0][0] for sentence in list_of_sentences]

In [4]:
# input - list of dictionaries (a dictionary for each word - the tokenized version of the sentence)
# output - list of dictionaries (dictionaries that contain punctuation one after another are squashed into one)
def squash_punctuation(sentence):
    new_sentence = []

    for i in range(len(sentence)):
        if sentence[i]['text'] in ',()"[];.?!:-':
            if len(new_sentence) > 0 and all(character in ',()"[];.?!:-' for character in new_sentence[-1]['text']):
                new_sentence[-1]['text'] += sentence[i]['text']
            else:
                new_sentence.append(sentence[i])
        else:
            new_sentence.append(sentence[i])
            
    return new_sentence

In [5]:
# input - sentence and an index of a word (dictionary) - assigns a label to each word based on the punctuation after
# output - a label - None for punctuation, punctuation for words followed by punctuation and empty for words if they are not
def word2label(sentence, i):
    if all(character in ',()"[];.?!:-' for character in sentence[i]['text']):
        return None

    if i < len(sentence) - 1:
        if all(character in ',()"[];.?!:-' for character in sentence[i+1]['text']):
            label = sentence[i+1]['text']
            return label
    
    return ''

In [6]:
# input - sentence (list of dictionaries)
# output - list of labels (strings) with None labels for punctuation being filtered out
def sent2labels(sentence):
    return [label for label in (word2label(sentence, i) for i in range(len(sentence))) if label != None]

In [7]:
# input - list of dictionaries (a dictionary for each word - the tokenized version of the sentence)
# output - new sentence (string) with the punctuation removed
def remove_punctuation(sentence):
    new_sentence = ''

    for i in range(len(sentence)):
        if all(character in ',()"[];.?!:-' for character in sentence[i]['text']):
            pass
        else:
            new_sentence = new_sentence + sentence[i]['text'] + ' '
    
    return new_sentence

In [8]:
def contains_interrogative_word(sentence):
    for i in range(len(sentence)):
        if sentence[i]['xpos'].startswith('Pi'):
            return True, i
    
    return False, -1

In [9]:
def contains_interrogative_particle(sentence):
    for i in range(len(sentence)):
        if sentence[i]['xpos'] == 'Ti':
            return True, i
    
    return False, -1

In [10]:
def contains_imperative_verb(sentence):
    for i in range(len(sentence)):
        if sentence[i]['xpos'][0] == 'V' and sentence[i]['xpos'][4] == 'z':
            return True, i
    
    return False, -1

In [11]:
def contains_relative_pronoun(sentence):
    for i in range(len(sentence)):
        if sentence[i]['xpos'].startswith('Pr'):
            return True, i
    
    return False, -1

In [23]:
def contains_repetitive_coord_conj_before(sentence, i):
    for word_i in range(len(sentence)):
        if word_i < i:
            if sentence[word_i]['xpos'] in ('Cr', 'Cp'):
                return True, word_i
    
    return False, -1

In [13]:
# input - sentence (list of dictionaries) and an index of a word (dictionary)
# output - list of dictionaries (features, for each word)
def word2features(sentence, i):
    sentence_contains_interrogative_word, interrogative_word_position = contains_interrogative_word(sentence)
    sentence_contains_interrogative_particle, interrogative_particle_position = contains_interrogative_particle(sentence)

    features = {
        'word': sentence[i]['text'],
        'sent_len': len(sentence),
        'pos_in_sent': i,
        'upos': sentence[i]['upos'],
        'xpos': sentence[i]['xpos'],
        'first_word_in_sent': sentence[0]['text'],
        'contains_interrogative_word': sentence_contains_interrogative_word,
        'interrogative_word_position': interrogative_word_position,
        'contains_interrogative_particle': sentence_contains_interrogative_particle,
        'interrogative_particle_position': interrogative_particle_position
    }

    if i > 0:
        features.update({
            'prev_word': sentence[i-1]['text'],
            'prev_word_upos': sentence[i-1]['upos'],
            'prev_word_xpos': sentence[i-1]['xpos']
        })
    else:
        features.update({
            'BOS': True
        })

    if i < len(sentence)-1:
        features.update({
            'next_word': sentence[i+1]['text'],
            'next_word_upos': sentence[i+1]['upos'],
            'next_word_xpos': sentence[i+1]['xpos']
        })
    else:
        features.update({
            'EOS': True
        })
        
    if i < len(sentence)-2:
        features.update({
            'word_after_next_word': sentence [i+2]['text'],
            'word_after_next_word_upos': sentence[i+2]['upos'],
            'word_after_next_word_xpos': sentence[i+2]['xpos']
        })

    return features

In [14]:
# input - sentence (list of dictionaries)
# output - list of features (dictionaries, for each word)
def sent2features(sentence):
    return [word2features(sentence, i) for i in range(len(sentence))]

In [15]:
# input - JSON-serializable data and name of the output JSON file
# output - None, saves the sentences to a JSON file
def save_as_json(data, output_file_name):
    with open(output_file_name, "w") as output_file:
        json.dump(data, output_file) 

In [16]:
# input - JSON file
# output - the contents of the JSON file as an object
def load_json(json_file_name):
    with open(json_file_name, "r") as json_file:
        return json.load(json_file)

In [17]:
# input - name of a text file with one sentence per line and a variable indicating whether or not to save X and y to JSON
# output - X and y - features and labels
def data_prep(input_file_name, json_serialize=False):
    data = read_file_as_list_of_sentences(input_file_name)
    nlp_tokenize = classla.Pipeline('bg', processors='tokenize')
    tokenized_data = run_through_classla_pipeline(data, nlp_tokenize)
    
    if len(data) != len(tokenized_data):
        print("Warning: Mismatch in the count of the data and tokenized data")

    squashed_tokenized_data = [squash_punctuation(sentence) for sentence in tokenized_data]

    if len(tokenized_data) != len(squashed_tokenized_data):
        print("Warning: Mismatch in the count of the tokenized and squashed tokenized data")
    
    y = [sent2labels(sentence) for sentence in squashed_tokenized_data]
    
    if len(squashed_tokenized_data) != len(y):
        print("Warning: Mismatch in the count of the squashed tokenized data and labeled data")
    
    data_without_punctuation = [remove_punctuation(sentence) for sentence in squashed_tokenized_data]
    
    if len(data_without_punctuation) != len(y):
        print("Warning: Mismatch in the count of the data without punctuation and labeled data")
    
    nlp_pos_tokenize = classla.Pipeline('bg', processors='tokenize,pos')   
    pos_tokenized_data = run_through_classla_pipeline(data_without_punctuation, nlp_pos_tokenize)
    
    if len(data_without_punctuation) != len(pos_tokenized_data):
        print("Warning: Mismatch in the count of the data without punctuation and POS tokenized data")
    
    X = [sent2features(sentence) for sentence in pos_tokenized_data]
    
    if len(X) != len(pos_tokenized_data):
        print("Warning: Mismatch in the count of the prepped data and POS tokenized data")
    
    if json_serialize:
        save_as_json(X, re.sub('\.txt', '_X.json', input_file_name))
        save_as_json(y, re.sub('\.txt', '_y.json', input_file_name))
        
    return X, y

In [18]:
# input - X and y - features and labels
# output - None, prints on the screen the features-labels pairs that have length mismatch
def verify_prepped_data(X, y):
    for feat, label in zip(X, y):
        if len(feat) != len(label):
            print(feat, label)

In [19]:
# input - X and y - features and labels
# output - list of punctuated sentences (strings; y labels applied to X)
def punctuate(X, y):
    punctuated_sentences = []

    for feat, label in zip(X, y):
        sentence = ''

        for i in range(len(feat)):
            sentence = sentence + feat[i]['word'] + label[i] + ' '
        
        punctuated_sentences.append(sentence.rstrip())
    
    return punctuated_sentences

### Data Prep

In [23]:
%%time
X_train, y_train = data_prep('../data/Train.txt')

2021-09-28 15:09:41 INFO: Loading these models for language: bg (Bulgarian):
| Processor | Package  |
------------------------
| tokenize  | standard |

2021-09-28 15:09:41 INFO: Use device: gpu
2021-09-28 15:09:41 INFO: Loading: tokenize
2021-09-28 15:09:41 INFO: Done loading processors!
2021-09-28 15:10:02 INFO: Loading these models for language: bg (Bulgarian):
| Processor | Package  |
------------------------
| tokenize  | standard |
| pos       | standard |

2021-09-28 15:10:02 INFO: Use device: gpu
2021-09-28 15:10:02 INFO: Loading: tokenize
2021-09-28 15:10:02 INFO: Loading: pos
2021-09-28 15:10:08 INFO: Done loading processors!


CPU times: user 23min 47s, sys: 3.55 s, total: 23min 51s
Wall time: 23min 50s


In [19]:
verify_prepped_data(X_train, y_train)

In [26]:
%%time
X_dev, y_dev = data_prep('../data/Dev.txt')

2021-09-28 16:53:08 INFO: Loading these models for language: bg (Bulgarian):
| Processor | Package  |
------------------------
| tokenize  | standard |

2021-09-28 16:53:08 INFO: Use device: gpu
2021-09-28 16:53:08 INFO: Loading: tokenize
2021-09-28 16:53:08 INFO: Done loading processors!
2021-09-28 16:53:16 INFO: Loading these models for language: bg (Bulgarian):
| Processor | Package  |
------------------------
| tokenize  | standard |
| pos       | standard |

2021-09-28 16:53:16 INFO: Use device: gpu
2021-09-28 16:53:16 INFO: Loading: tokenize
2021-09-28 16:53:16 INFO: Loading: pos
2021-09-28 16:53:17 INFO: Done loading processors!


CPU times: user 11min 46s, sys: 1.19 s, total: 11min 47s
Wall time: 11min 46s


In [21]:
verify_prepped_data(X_dev, y_dev)

### CRF Model

In [24]:
%%time
crf = sklearn_crfsuite.CRF(
    algorithm='lbfgs',
    c1=0.1,
    c2=0.1,
    max_iterations=100,
    all_possible_transitions=True
)
crf.fit(X_train, y_train)

CPU times: user 48min 25s, sys: 2.73 s, total: 48min 28s
Wall time: 48min 21s


In [27]:
labels = list(crf.classes_)

In [28]:
labels

['',
 ':',
 '!',
 ',',
 '?',
 ';',
 '.',
 '-',
 ',(',
 '"',
 '").',
 '(',
 ')',
 '?-',
 '.-',
 ':)',
 '!-',
 ':"',
 '."',
 ':(',
 '?)',
 '"?',
 '".',
 '";',
 '"-',
 ').',
 '!,',
 '-"',
 ')!',
 '",',
 '.")',
 '!;',
 ';(',
 '),',
 '?;',
 '?"',
 ');',
 '.)',
 '?,',
 ';"',
 '):',
 '[',
 ']',
 '...',
 '],',
 '"(',
 ',[',
 ',"',
 ']",',
 ')-',
 '":',
 '("',
 '"),',
 '"-[',
 '...,',
 '")',
 '..',
 '","',
 '?".',
 '"[',
 '"("',
 '),"',
 '""',
 '-.',
 '.,',
 '].',
 ')"',
 '!"',
 '?"-',
 '":"',
 '):"',
 '")-',
 '"-"',
 '-,',
 ')".',
 '?",',
 ']".',
 '!",',
 ']?',
 '...[',
 '];',
 '...".',
 '!"-',
 '"!',
 '!".',
 '!?',
 '?!',
 ',-',
 '!...',
 '!).',
 ')?',
 '?...',
 '.?']

In [29]:
y_pred = crf.predict(X_dev)

In [30]:
metrics.flat_f1_score(y_dev, y_pred,
                      average='weighted', labels=labels)

  average, "true nor predicted", 'F-score is', len(true_sum)


0.8866900230240949

In [31]:
# show all metrics
# sorted_labels = sorted(
#     labels,
#     key=lambda name: (name[1:], name[0])
# )
print(metrics.flat_classification_report(
    y_dev, y_pred, labels=labels, digits=3
))

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


              precision    recall  f1-score   support

                  0.926     0.970     0.948    227825
           :      0.704     0.476     0.568       923
           !      0.552     0.245     0.339       347
           ,      0.452     0.298     0.359     18592
           ?      0.892     0.747     0.813       443
           ;      0.128     0.018     0.031       622
           .      0.923     0.985     0.953     11401
           -      0.484     0.051     0.092       882
          ,(      0.000     0.000     0.000         5
           "      0.276     0.034     0.061      1596
         ").      0.000     0.000     0.000         0
           (      0.854     0.723     0.783      1656
           )      0.696     0.576     0.631      1001
          ?-      0.000     0.000     0.000         4
          .-      0.000     0.000     0.000         2
          :)      0.000     0.000     0.000         0
          !-      0.000     0.000     0.000        20
          :"      1.000    

In [32]:
labels.remove('')

In [33]:
y_pred = crf.predict(X_dev)

In [34]:
metrics.flat_f1_score(y_dev, y_pred,
                      average='weighted', labels=labels)

  average, "true nor predicted", 'F-score is', len(true_sum)


0.5366036058465298

In [35]:
print(metrics.flat_classification_report(
    y_dev, y_pred, labels=labels, digits=3
))

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


              precision    recall  f1-score   support

           :      0.704     0.476     0.568       923
           !      0.552     0.245     0.339       347
           ,      0.452     0.298     0.359     18592
           ?      0.892     0.747     0.813       443
           ;      0.128     0.018     0.031       622
           .      0.923     0.985     0.953     11401
           -      0.484     0.051     0.092       882
          ,(      0.000     0.000     0.000         5
           "      0.276     0.034     0.061      1596
         ").      0.000     0.000     0.000         0
           (      0.854     0.723     0.783      1656
           )      0.696     0.576     0.631      1001
          ?-      0.000     0.000     0.000         4
          .-      0.000     0.000     0.000         2
          :)      0.000     0.000     0.000         0
          !-      0.000     0.000     0.000        20
          :"      1.000     0.015     0.029        67
          ."      0.000    

In [31]:
punctuated_sentences = punctuate(X_dev, y_pred)

In [33]:
punctuated_sentences[:100]

['Но синовете им не умъртви, защото постъпи според писаното в закона в книгата на Мойсей, където ГОСПОД заповяда и каза: Бащите да не умират заради синовете и синовете да не умират заради бащите, а всеки да умира за собствения си грях.',
 'Но когато ти говоря ще отворя устата ти и ти им кажи: Така казва Господ БОГ: Който слуша, нека слуша, а който не слуша нека не слуша, защото са бунтовен дом.',
 'Защото кой е по-голям този, който седи на трапезата ли или онзи, който слугува?',
 'А след това ще се насели, както в предишните дни, заявява ГОСПОД.',
 'А Той беше на задната част, заспал на възглавница.',
 'Ефрем е като птица славата му ще отлети.',
 'И така той си отиде от него на известно разстояние.',
 'Не бягам при халдейците!',
 'И израилевите синове излязоха от египетската земя, строени.',
 'А когато Ахитофел видя, че съветът му не се изпълни оседла магарето си и стана и отиде у дома си в своя град и нареди домашните си работи и се обеси.',
 'Ако умре някое животно от добитъка, който

In [48]:
punctuated_sentences[:100]

['Но синовете им не умъртви, защото постъпи според писаното в закона в книгата на Мойсей, където ГОСПОД заповяда и каза: Бащите да не умират заради синовете и синовете да не умират заради бащите, а всеки да умира за собствения си грях.',
 'Но когато ти говоря ще отворя устата ти и ти им кажи: Така казва Господ БОГ: Който слуша нека слуша, а който не слуша нека не слуша, защото са бунтовен дом.',
 'Защото кой е по-голям този, който седи на трапезата ли или онзи, който слугува.',
 'А след това ще се насели, както в предишните дни, заявява ГОСПОД.',
 'А Той беше на задната част заспал на възглавница.',
 'Ефрем е като птица славата му ще отлети.',
 'И така той си отиде от него на известно разстояние.',
 'Не бягам при халдейците!',
 'И израилевите синове излязоха от египетската земя строени.',
 'А когато Ахитофел видя, че съветът му не се изпълни оседла магарето си и стана и отиде у дома си в своя град и нареди домашните си работи и се обеси.',
 'Ако умре някое животно от добитъка, който мо

In [49]:
correct_punctuated_sentences = punctuate(X_dev, y_dev)

In [50]:
correct_punctuated_sentences[:100]

['Но синовете им не умъртви, защото постъпи, според писаното в закона, в книгата на Мойсей, където ГОСПОД заповяда и каза: Бащите да не умират заради синовете и синовете да не умират заради бащите, а всеки да умира за собствения си грях.',
 'Но когато ти говоря, ще отворя устата ти и ти им кажи: Така казва Господ БОГ: Който слуша, нека слуша, а който не слуша, нека не слуша- защото са бунтовен дом.',
 'Защото кой е по-голям: този, който седи на трапезата ли, или онзи, който слугува?',
 'А след това ще се насели както в предишните дни, заявява ГОСПОД.',
 'А Той беше на задната част, заспал на възглавница.',
 'Ефрем е като птица, славата му ще отлети.',
 'И така, той си отиде от него на известно разстояние.',
 'Не бягам при халдейците.',
 'И израилевите синове излязоха от египетската земя строени.',
 'А когато Ахитофел видя, че съветът му не се изпълни, оседла магарето си и стана, и отиде у дома си, в своя град, и нареди домашните си работи, и се обеси.',
 'Ако умре някое животно от доби

### Grid Search Try - Worse Results

In [None]:
# from sklearn.metrics import make_scorer
# import scipy.stats
# from sklearn.model_selection import RandomizedSearchCV

In [None]:
# %%time
# # define fixed parameters and parameters to search
# crf = sklearn_crfsuite.CRF(
#     algorithm='lbfgs',
#     max_iterations=100,
#     all_possible_transitions=True
# )
# params_space = {
#     'c1': scipy.stats.expon(scale=0.5),
#     'c2': scipy.stats.expon(scale=0.05),
# }

# # use the same metric for evaluation
# f1_scorer = make_scorer(metrics.flat_f1_score,
#                         average='weighted', labels=labels)

# # search
# rs = RandomizedSearchCV(crf, params_space,
#                         cv=3,
#                         verbose=1,
#                         n_jobs=-1,
#                         n_iter=50,
#                         scoring=f1_scorer)
# rs.fit(X, y)

In [None]:
# crf = rs.best_estimator_
# print('best params:', rs.best_params_)
# print('best CV score:', rs.best_score_)
# print('model size: {:0.2f}M'.format(rs.best_estimator_.size_ / 1000000))

In [None]:
# crf = rs.best_estimator_
# y_pred = crf.predict(X_test)

# sorted_labels = sorted(
#     labels,
#     key=lambda name: (name[1:], name[0])
# )

# print(metrics.flat_classification_report(
#     y_test, y_pred, labels=sorted_labels, digits=3
# ))

### Sentence contains imperative verb

In [23]:
# input - sentence (list of dictionaries) and an index of a word (dictionary)
# output - list of dictionaries (features, for each word)
def word2features(sentence, i):
    sentence_contains_interrogative_word, interrogative_word_position = contains_interrogative_word(sentence)
    sentence_contains_interrogative_particle, interrogative_particle_position = contains_interrogative_particle(sentence)
    sentence_contains_imperative_verb, imperative_verb_position = contains_imperative_verb(sentence)

    features = {
        'word': sentence[i]['text'],
        'sent_len': len(sentence),
        'pos_in_sent': i,
        'upos': sentence[i]['upos'],
        'xpos': sentence[i]['xpos'],
        'first_word_in_sent': sentence[0]['text'],
        'contains_interrogative_word': sentence_contains_interrogative_word,
        'interrogative_word_position': interrogative_word_position,
        'contains_interrogative_particle': sentence_contains_interrogative_particle,
        'interrogative_particle_position': interrogative_particle_position,
        'contains_imperative_verb': sentence_contains_imperative_verb,
        'imperative_verb_position': imperative_verb_position
    }

    if i > 0:
        features.update({
            'prev_word': sentence[i-1]['text'],
            'prev_word_upos': sentence[i-1]['upos'],
            'prev_word_xpos': sentence[i-1]['xpos']
        })
    else:
        features.update({
            'BOS': True
        })

    if i < len(sentence)-1:
        features.update({
            'next_word': sentence[i+1]['text'],
            'next_word_upos': sentence[i+1]['upos'],
            'next_word_xpos': sentence[i+1]['xpos']
        })
    else:
        features.update({
            'EOS': True
        })
        
    if i < len(sentence)-2:
        features.update({
            'word_after_next_word': sentence [i+2]['text'],
            'word_after_next_word_upos': sentence[i+2]['upos'],
            'word_after_next_word_xpos': sentence[i+2]['xpos']
        })

    return features

In [24]:
%%time
X_train, y_train = data_prep('../data/Train.txt')

2021-09-29 00:35:06 INFO: Loading these models for language: bg (Bulgarian):
| Processor | Package  |
------------------------
| tokenize  | standard |

2021-09-29 00:35:06 INFO: Use device: gpu
2021-09-29 00:35:06 INFO: Loading: tokenize
2021-09-29 00:35:06 INFO: Done loading processors!
2021-09-29 00:35:30 INFO: Loading these models for language: bg (Bulgarian):
| Processor | Package  |
------------------------
| tokenize  | standard |
| pos       | standard |

2021-09-29 00:35:30 INFO: Use device: gpu
2021-09-29 00:35:30 INFO: Loading: tokenize
2021-09-29 00:35:30 INFO: Loading: pos
2021-09-29 00:35:36 INFO: Done loading processors!


CPU times: user 23min 14s, sys: 3.57 s, total: 23min 18s
Wall time: 23min 17s


In [25]:
verify_prepped_data(X_train, y_train)

In [26]:
%%time
X_dev, y_dev = data_prep('../data/Dev.txt')

2021-09-29 00:58:24 INFO: Loading these models for language: bg (Bulgarian):
| Processor | Package  |
------------------------
| tokenize  | standard |

2021-09-29 00:58:24 INFO: Use device: gpu
2021-09-29 00:58:24 INFO: Loading: tokenize
2021-09-29 00:58:24 INFO: Done loading processors!
2021-09-29 00:58:37 INFO: Loading these models for language: bg (Bulgarian):
| Processor | Package  |
------------------------
| tokenize  | standard |
| pos       | standard |

2021-09-29 00:58:37 INFO: Use device: gpu
2021-09-29 00:58:37 INFO: Loading: tokenize
2021-09-29 00:58:37 INFO: Loading: pos
2021-09-29 00:58:38 INFO: Done loading processors!


CPU times: user 11min 49s, sys: 1.09 s, total: 11min 50s
Wall time: 11min 49s


In [27]:
verify_prepped_data(X_dev, y_dev)

In [28]:
%%time
crf = sklearn_crfsuite.CRF(
    algorithm='lbfgs',
    c1=0.1,
    c2=0.1,
    max_iterations=100,
    all_possible_transitions=True
)
crf.fit(X_train, y_train)

CPU times: user 29min 3s, sys: 1.64 s, total: 29min 4s
Wall time: 29min 1s


In [29]:
labels = list(crf.classes_)
y_pred = crf.predict(X_dev)
print(metrics.flat_f1_score(y_dev, y_pred, average='weighted', labels=labels))
print(metrics.flat_classification_report(y_dev, y_pred, labels=labels, digits=3))
labels.remove('')
print(metrics.flat_f1_score(y_dev, y_pred, average='weighted', labels=labels))
print(metrics.flat_classification_report(y_dev, y_pred, labels=labels, digits=3))
punctuated_sentences = punctuate(X_dev, y_pred)
print(punctuated_sentences[:100])

  average, "true nor predicted", 'F-score is', len(true_sum)


0.9367597236634531


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


              precision    recall  f1-score   support

                  0.959     0.988     0.973    227825
           :      0.901     0.638     0.747       923
           !      0.524     0.190     0.279       347
           ,      0.790     0.640     0.707     18592
           ?      0.841     0.691     0.758       443
           ;      0.252     0.042     0.072       622
           .      0.924     0.982     0.952     11401
           -      0.519     0.093     0.158       882
          ,(      0.000     0.000     0.000         5
           "      0.601     0.128     0.212      1596
         ").      0.000     0.000     0.000         0
           (      0.963     0.874     0.917      1656
           )      0.755     0.759     0.757      1001
          ?-      0.333     0.250     0.286         4
          .-      0.000     0.000     0.000         2
          :)      0.000     0.000     0.000         0
          !-      0.000     0.000     0.000        20
          :"      0.500    



              precision    recall  f1-score   support

           :      0.901     0.638     0.747       923
           !      0.524     0.190     0.279       347
           ,      0.790     0.640     0.707     18592
           ?      0.841     0.691     0.758       443
           ;      0.252     0.042     0.072       622
           .      0.924     0.982     0.952     11401
           -      0.519     0.093     0.158       882
          ,(      0.000     0.000     0.000         5
           "      0.601     0.128     0.212      1596
         ").      0.000     0.000     0.000         0
           (      0.963     0.874     0.917      1656
           )      0.755     0.759     0.757      1001
          ?-      0.333     0.250     0.286         4
          .-      0.000     0.000     0.000         2
          :)      0.000     0.000     0.000         0
          !-      0.000     0.000     0.000        20
          :"      0.500     0.045     0.082        67
          ."      0.250    

### Sentence contains relative pronoun

In [30]:
# input - sentence (list of dictionaries) and an index of a word (dictionary)
# output - list of dictionaries (features, for each word)
def word2features(sentence, i):
    sentence_contains_interrogative_word, interrogative_word_position = contains_interrogative_word(sentence)
    sentence_contains_interrogative_particle, interrogative_particle_position = contains_interrogative_particle(sentence)
    sentence_contains_relative_pronoun, relative_pronoun_position = contains_relative_pronoun(sentence)

    features = {
        'word': sentence[i]['text'],
        'sent_len': len(sentence),
        'pos_in_sent': i,
        'upos': sentence[i]['upos'],
        'xpos': sentence[i]['xpos'],
        'first_word_in_sent': sentence[0]['text'],
        'contains_interrogative_word': sentence_contains_interrogative_word,
        'interrogative_word_position': interrogative_word_position,
        'contains_interrogative_particle': sentence_contains_interrogative_particle,
        'interrogative_particle_position': interrogative_particle_position,
        'contains_relative_pronoun': sentence_contains_relative_pronoun,
        'relative_pronoun_position': relative_pronoun_position
    }

    if i > 0:
        features.update({
            'prev_word': sentence[i-1]['text'],
            'prev_word_upos': sentence[i-1]['upos'],
            'prev_word_xpos': sentence[i-1]['xpos']
        })
    else:
        features.update({
            'BOS': True
        })

    if i < len(sentence)-1:
        features.update({
            'next_word': sentence[i+1]['text'],
            'next_word_upos': sentence[i+1]['upos'],
            'next_word_xpos': sentence[i+1]['xpos']
        })
    else:
        features.update({
            'EOS': True
        })
        
    if i < len(sentence)-2:
        features.update({
            'word_after_next_word': sentence [i+2]['text'],
            'word_after_next_word_upos': sentence[i+2]['upos'],
            'word_after_next_word_xpos': sentence[i+2]['xpos']
        })

    return features

In [31]:
%%time
X_train, y_train = data_prep('../data/Train.txt')

2021-09-29 01:39:26 INFO: Loading these models for language: bg (Bulgarian):
| Processor | Package  |
------------------------
| tokenize  | standard |

2021-09-29 01:39:26 INFO: Use device: gpu
2021-09-29 01:39:26 INFO: Loading: tokenize
2021-09-29 01:39:26 INFO: Done loading processors!
2021-09-29 01:39:44 INFO: Loading these models for language: bg (Bulgarian):
| Processor | Package  |
------------------------
| tokenize  | standard |
| pos       | standard |

2021-09-29 01:39:44 INFO: Use device: gpu
2021-09-29 01:39:44 INFO: Loading: tokenize
2021-09-29 01:39:44 INFO: Loading: pos
2021-09-29 01:39:45 INFO: Done loading processors!


CPU times: user 23min 16s, sys: 2.07 s, total: 23min 18s
Wall time: 23min 17s


In [32]:
verify_prepped_data(X_train, y_train)

In [33]:
%%time
X_dev, y_dev = data_prep('../data/Dev.txt')

2021-09-29 02:02:44 INFO: Loading these models for language: bg (Bulgarian):
| Processor | Package  |
------------------------
| tokenize  | standard |

2021-09-29 02:02:44 INFO: Use device: gpu
2021-09-29 02:02:44 INFO: Loading: tokenize
2021-09-29 02:02:44 INFO: Done loading processors!
2021-09-29 02:02:56 INFO: Loading these models for language: bg (Bulgarian):
| Processor | Package  |
------------------------
| tokenize  | standard |
| pos       | standard |

2021-09-29 02:02:56 INFO: Use device: gpu
2021-09-29 02:02:56 INFO: Loading: tokenize
2021-09-29 02:02:56 INFO: Loading: pos
2021-09-29 02:02:58 INFO: Done loading processors!


CPU times: user 11min 47s, sys: 1.06 s, total: 11min 48s
Wall time: 11min 47s


In [34]:
verify_prepped_data(X_dev, y_dev)

In [35]:
%%time
crf = sklearn_crfsuite.CRF(
    algorithm='lbfgs',
    c1=0.1,
    c2=0.1,
    max_iterations=100,
    all_possible_transitions=True
)
crf.fit(X_train, y_train)

CPU times: user 30min 24s, sys: 1.36 s, total: 30min 25s
Wall time: 30min 22s


In [36]:
labels = list(crf.classes_)
y_pred = crf.predict(X_dev)
print(metrics.flat_f1_score(y_dev, y_pred, average='weighted', labels=labels))
print(metrics.flat_classification_report(y_dev, y_pred, labels=labels, digits=3))
labels.remove('')
print(metrics.flat_f1_score(y_dev, y_pred, average='weighted', labels=labels))
print(metrics.flat_classification_report(y_dev, y_pred, labels=labels, digits=3))
punctuated_sentences = punctuate(X_dev, y_pred)
print(punctuated_sentences[:100])

  average, "true nor predicted", 'F-score is', len(true_sum)


0.9360140355084204


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


              precision    recall  f1-score   support

                  0.957     0.990     0.973    227825
           :      0.902     0.641     0.750       923
           !      0.482     0.159     0.239       347
           ,      0.806     0.623     0.703     18592
           ?      0.832     0.702     0.761       443
           ;      0.190     0.032     0.055       622
           .      0.922     0.983     0.951     11401
           -      0.467     0.090     0.150       882
          ,(      0.000     0.000     0.000         5
           "      0.584     0.133     0.216      1596
         ").      0.000     0.000     0.000         0
           (      0.971     0.862     0.913      1656
           )      0.745     0.737     0.741      1001
          ?-      0.000     0.000     0.000         4
          .-      0.000     0.000     0.000         2
          :)      0.000     0.000     0.000         0
          !-      0.000     0.000     0.000        20
          :"      0.400    



              precision    recall  f1-score   support

           :      0.902     0.641     0.750       923
           !      0.482     0.159     0.239       347
           ,      0.806     0.623     0.703     18592
           ?      0.832     0.702     0.761       443
           ;      0.190     0.032     0.055       622
           .      0.922     0.983     0.951     11401
           -      0.467     0.090     0.150       882
          ,(      0.000     0.000     0.000         5
           "      0.584     0.133     0.216      1596
         ").      0.000     0.000     0.000         0
           (      0.971     0.862     0.913      1656
           )      0.745     0.737     0.741      1001
          ?-      0.000     0.000     0.000         4
          .-      0.000     0.000     0.000         2
          :)      0.000     0.000     0.000         0
          !-      0.000     0.000     0.000        20
          :"      0.400     0.030     0.056        67
          ."      0.077    

### Contains repetitive coordinative conjunction

In [24]:
# input - sentence (list of dictionaries) and an index of a word (dictionary)
# output - list of dictionaries (features, for each word)
def word2features(sentence, i):
    sentence_contains_interrogative_word, interrogative_word_position = contains_interrogative_word(sentence)
    sentence_contains_interrogative_particle, interrogative_particle_position = contains_interrogative_particle(sentence)
    sentence_contains_repetitive_coord_conj, repetitive_coord_conj_position = contains_repetitive_coord_conj_before(sentence, i)

    features = {
        'word': sentence[i]['text'],
        'sent_len': len(sentence),
        'pos_in_sent': i,
        'upos': sentence[i]['upos'],
        'xpos': sentence[i]['xpos'],
        'first_word_in_sent': sentence[0]['text'],
        'contains_interrogative_word': sentence_contains_interrogative_word,
        'interrogative_word_position': interrogative_word_position,
        'contains_interrogative_particle': sentence_contains_interrogative_particle,
        'interrogative_particle_position': interrogative_particle_position,
        'contains_repetitive_coord_conj_before': sentence_contains_repetitive_coord_conj,
        'repetitive_coord_conj_position': repetitive_coord_conj_position
    }

    if i > 0:
        features.update({
            'prev_word': sentence[i-1]['text'],
            'prev_word_upos': sentence[i-1]['upos'],
            'prev_word_xpos': sentence[i-1]['xpos']
        })
    else:
        features.update({
            'BOS': True
        })

    if i < len(sentence)-1:
        features.update({
            'next_word': sentence[i+1]['text'],
            'next_word_upos': sentence[i+1]['upos'],
            'next_word_xpos': sentence[i+1]['xpos']
        })
    else:
        features.update({
            'EOS': True
        })
        
    if i < len(sentence)-2:
        features.update({
            'word_after_next_word': sentence [i+2]['text'],
            'word_after_next_word_upos': sentence[i+2]['upos'],
            'word_after_next_word_xpos': sentence[i+2]['xpos']
        })

    return features

In [25]:
%%time
X_train, y_train = data_prep('../data/Train.txt')

2021-09-29 09:46:52 INFO: Loading these models for language: bg (Bulgarian):
| Processor | Package  |
------------------------
| tokenize  | standard |

2021-09-29 09:46:52 INFO: Use device: gpu
2021-09-29 09:46:52 INFO: Loading: tokenize
2021-09-29 09:46:52 INFO: Done loading processors!
2021-09-29 09:47:14 INFO: Loading these models for language: bg (Bulgarian):
| Processor | Package  |
------------------------
| tokenize  | standard |
| pos       | standard |

2021-09-29 09:47:14 INFO: Use device: gpu
2021-09-29 09:47:14 INFO: Loading: tokenize
2021-09-29 09:47:14 INFO: Loading: pos
2021-09-29 09:47:16 INFO: Done loading processors!


CPU times: user 23min 4s, sys: 3.16 s, total: 23min 7s
Wall time: 23min 6s


In [26]:
verify_prepped_data(X_train, y_train)

In [27]:
%%time
X_dev, y_dev = data_prep('../data/Dev.txt')

2021-09-29 10:09:58 INFO: Loading these models for language: bg (Bulgarian):
| Processor | Package  |
------------------------
| tokenize  | standard |

2021-09-29 10:09:58 INFO: Use device: gpu
2021-09-29 10:09:58 INFO: Loading: tokenize
2021-09-29 10:09:58 INFO: Done loading processors!
2021-09-29 10:10:11 INFO: Loading these models for language: bg (Bulgarian):
| Processor | Package  |
------------------------
| tokenize  | standard |
| pos       | standard |

2021-09-29 10:10:11 INFO: Use device: gpu
2021-09-29 10:10:11 INFO: Loading: tokenize
2021-09-29 10:10:11 INFO: Loading: pos
2021-09-29 10:10:13 INFO: Done loading processors!


CPU times: user 11min 46s, sys: 1.33 s, total: 11min 48s
Wall time: 11min 47s


In [28]:
verify_prepped_data(X_dev, y_dev)

In [29]:
%%time
crf = sklearn_crfsuite.CRF(
    algorithm='lbfgs',
    c1=0.1,
    c2=0.1,
    max_iterations=100,
    all_possible_transitions=True
)
crf.fit(X_train, y_train)

CPU times: user 28min 59s, sys: 1.67 s, total: 29min 1s
Wall time: 28min 57s


In [30]:
labels = list(crf.classes_)
y_pred = crf.predict(X_dev)
print(metrics.flat_f1_score(y_dev, y_pred, average='weighted', labels=labels))
print(metrics.flat_classification_report(y_dev, y_pred, labels=labels, digits=3))
labels.remove('')
print(metrics.flat_f1_score(y_dev, y_pred, average='weighted', labels=labels))
print(metrics.flat_classification_report(y_dev, y_pred, labels=labels, digits=3))
punctuated_sentences = punctuate(X_dev, y_pred)
print(punctuated_sentences[:100])

  average, "true nor predicted", 'F-score is', len(true_sum)


0.9365331967819225


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


              precision    recall  f1-score   support

                  0.958     0.989     0.973    227825
           :      0.895     0.647     0.751       923
           !      0.536     0.173     0.261       347
           ,      0.796     0.633     0.705     18592
           ?      0.843     0.677     0.751       443
           ;      0.269     0.045     0.077       622
           .      0.922     0.983     0.952     11401
           -      0.516     0.091     0.154       882
          ,(      0.000     0.000     0.000         5
           "      0.610     0.128     0.212      1596
         ").      0.000     0.000     0.000         0
           (      0.964     0.873     0.916      1656
           )      0.766     0.763     0.765      1001
          ?-      0.000     0.000     0.000         4
          .-      0.000     0.000     0.000         2
          :)      0.000     0.000     0.000         0
          !-      0.000     0.000     0.000        20
          :"      0.875    



              precision    recall  f1-score   support

           :      0.895     0.647     0.751       923
           !      0.536     0.173     0.261       347
           ,      0.796     0.633     0.705     18592
           ?      0.843     0.677     0.751       443
           ;      0.269     0.045     0.077       622
           .      0.922     0.983     0.952     11401
           -      0.516     0.091     0.154       882
          ,(      0.000     0.000     0.000         5
           "      0.610     0.128     0.212      1596
         ").      0.000     0.000     0.000         0
           (      0.964     0.873     0.916      1656
           )      0.766     0.763     0.765      1001
          ?-      0.000     0.000     0.000         4
          .-      0.000     0.000     0.000         2
          :)      0.000     0.000     0.000         0
          !-      0.000     0.000     0.000        20
          :"      0.875     0.104     0.187        67
          ."      0.000    

### Add more context of words before

In [31]:
# input - sentence (list of dictionaries) and an index of a word (dictionary)
# output - list of dictionaries (features, for each word)
def word2features(sentence, i):
    sentence_contains_interrogative_word, interrogative_word_position = contains_interrogative_word(sentence)
    sentence_contains_interrogative_particle, interrogative_particle_position = contains_interrogative_particle(sentence)

    features = {
        'word': sentence[i]['text'],
        'sent_len': len(sentence),
        'pos_in_sent': i,
        'upos': sentence[i]['upos'],
        'xpos': sentence[i]['xpos'],
        'first_word_in_sent': sentence[0]['text'],
        'contains_interrogative_word': sentence_contains_interrogative_word,
        'interrogative_word_position': interrogative_word_position,
        'contains_interrogative_particle': sentence_contains_interrogative_particle,
        'interrogative_particle_position': interrogative_particle_position
    }

    if i > 0:
        features.update({
            'prev_word': sentence[i-1]['text'],
            'prev_word_upos': sentence[i-1]['upos'],
            'prev_word_xpos': sentence[i-1]['xpos']
        })
    else:
        features.update({
            'BOS': True
        })

    if i > 1:
        features.update({
            'word_before_prev_word': sentence[i-2]['text'],
            'word_before_prev_word_upos': sentence[i-2]['upos'],
            'word_before_prev_word_xpos': sentence[i-2]['xpos']
        })

    if i < len(sentence)-1:
        features.update({
            'next_word': sentence[i+1]['text'],
            'next_word_upos': sentence[i+1]['upos'],
            'next_word_xpos': sentence[i+1]['xpos']
        })
    else:
        features.update({
            'EOS': True
        })
        
    if i < len(sentence)-2:
        features.update({
            'word_after_next_word': sentence [i+2]['text'],
            'word_after_next_word_upos': sentence[i+2]['upos'],
            'word_after_next_word_xpos': sentence[i+2]['xpos']
        })

    return features

In [32]:
%%time
X_train, y_train = data_prep('../data/Train.txt')

2021-09-29 10:50:57 INFO: Loading these models for language: bg (Bulgarian):
| Processor | Package  |
------------------------
| tokenize  | standard |

2021-09-29 10:50:57 INFO: Use device: gpu
2021-09-29 10:50:57 INFO: Loading: tokenize
2021-09-29 10:50:57 INFO: Done loading processors!
2021-09-29 10:51:14 INFO: Loading these models for language: bg (Bulgarian):
| Processor | Package  |
------------------------
| tokenize  | standard |
| pos       | standard |

2021-09-29 10:51:14 INFO: Use device: gpu
2021-09-29 10:51:14 INFO: Loading: tokenize
2021-09-29 10:51:14 INFO: Loading: pos
2021-09-29 10:51:15 INFO: Done loading processors!


CPU times: user 23min 14s, sys: 2.34 s, total: 23min 16s
Wall time: 23min 15s


In [33]:
verify_prepped_data(X_train, y_train)

In [34]:
%%time
X_dev, y_dev = data_prep('../data/Dev.txt')

2021-09-29 11:14:12 INFO: Loading these models for language: bg (Bulgarian):
| Processor | Package  |
------------------------
| tokenize  | standard |

2021-09-29 11:14:12 INFO: Use device: gpu
2021-09-29 11:14:12 INFO: Loading: tokenize
2021-09-29 11:14:12 INFO: Done loading processors!
2021-09-29 11:14:25 INFO: Loading these models for language: bg (Bulgarian):
| Processor | Package  |
------------------------
| tokenize  | standard |
| pos       | standard |

2021-09-29 11:14:25 INFO: Use device: gpu
2021-09-29 11:14:25 INFO: Loading: tokenize
2021-09-29 11:14:25 INFO: Loading: pos
2021-09-29 11:14:27 INFO: Done loading processors!


CPU times: user 11min 47s, sys: 1.13 s, total: 11min 48s
Wall time: 11min 47s


In [35]:
verify_prepped_data(X_dev, y_dev)

In [36]:
%%time
crf = sklearn_crfsuite.CRF(
    algorithm='lbfgs',
    c1=0.1,
    c2=0.1,
    max_iterations=100,
    all_possible_transitions=True
)
crf.fit(X_train, y_train)

CPU times: user 33min 15s, sys: 2.07 s, total: 33min 18s
Wall time: 33min 13s


In [37]:
labels = list(crf.classes_)
y_pred = crf.predict(X_dev)
print(metrics.flat_f1_score(y_dev, y_pred, average='weighted', labels=labels))
print(metrics.flat_classification_report(y_dev, y_pred, labels=labels, digits=3))
labels.remove('')
print(metrics.flat_f1_score(y_dev, y_pred, average='weighted', labels=labels))
print(metrics.flat_classification_report(y_dev, y_pred, labels=labels, digits=3))
punctuated_sentences = punctuate(X_dev, y_pred)
print(punctuated_sentences[:100])

  average, "true nor predicted", 'F-score is', len(true_sum)


0.9379600791314132


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


              precision    recall  f1-score   support

                  0.958     0.989     0.973    227825
           :      0.915     0.657     0.765       923
           !      0.508     0.193     0.280       347
           ,      0.796     0.635     0.707     18592
           ?      0.846     0.632     0.724       443
           ;      0.274     0.047     0.080       622
           .      0.932     0.987     0.958     11401
           -      0.566     0.102     0.173       882
          ,(      0.000     0.000     0.000         5
           "      0.613     0.119     0.199      1596
         ").      0.000     0.000     0.000         0
           (      0.966     0.871     0.916      1656
           )      0.885     0.825     0.854      1001
          ?-      0.000     0.000     0.000         4
          .-      0.000     0.000     0.000         2
          :)      0.000     0.000     0.000         0
          !-      0.000     0.000     0.000        20
          :"      0.250    



              precision    recall  f1-score   support

           :      0.915     0.657     0.765       923
           !      0.508     0.193     0.280       347
           ,      0.796     0.635     0.707     18592
           ?      0.846     0.632     0.724       443
           ;      0.274     0.047     0.080       622
           .      0.932     0.987     0.958     11401
           -      0.566     0.102     0.173       882
          ,(      0.000     0.000     0.000         5
           "      0.613     0.119     0.199      1596
         ").      0.000     0.000     0.000         0
           (      0.966     0.871     0.916      1656
           )      0.885     0.825     0.854      1001
          ?-      0.000     0.000     0.000         4
          .-      0.000     0.000     0.000         2
          :)      0.000     0.000     0.000         0
          !-      0.000     0.000     0.000        20
          :"      0.250     0.015     0.028        67
          ."      0.000    

In [39]:
# input - sentence (list of dictionaries) and an index of a word (dictionary)
# output - list of dictionaries (features, for each word)
def word2features(sentence, i):
    sentence_contains_interrogative_word, interrogative_word_position = contains_interrogative_word(sentence)
    sentence_contains_interrogative_particle, interrogative_particle_position = contains_interrogative_particle(sentence)

    features = {
        'word': sentence[i]['text'],
        'sent_len': len(sentence),
        'pos_in_sent': i,
        'upos': sentence[i]['upos'],
        'xpos': sentence[i]['xpos'],
        'first_word_in_sent': sentence[0]['text'],
        'contains_interrogative_word': sentence_contains_interrogative_word,
        'interrogative_word_position': interrogative_word_position,
        'contains_interrogative_particle': sentence_contains_interrogative_particle,
        'interrogative_particle_position': interrogative_particle_position
    }

    if i > 0:
        features.update({
            'prev_word': sentence[i-1]['text'],
            'prev_word_upos': sentence[i-1]['upos'],
            'prev_word_xpos': sentence[i-1]['xpos']
        })
    else:
        features.update({
            'BOS': True
        })

    if i > 1:
        features.update({
            'word_before_prev_word': sentence[i-2]['text'],
            'word_before_prev_word_upos': sentence[i-2]['upos'],
            'word_before_prev_word_xpos': sentence[i-2]['xpos']
        })

    if i > 2:
        features.update({
            '2_words_before_prev_word': sentence[i-3]['text'],
            '2_words_before_prev_word_upos': sentence[i-3]['upos'],
            '2_words_before_prev_word_xpos': sentence[i-3]['xpos']
        })

    if i < len(sentence)-1:
        features.update({
            'next_word': sentence[i+1]['text'],
            'next_word_upos': sentence[i+1]['upos'],
            'next_word_xpos': sentence[i+1]['xpos']
        })
    else:
        features.update({
            'EOS': True
        })
        
    if i < len(sentence)-2:
        features.update({
            'word_after_next_word': sentence [i+2]['text'],
            'word_after_next_word_upos': sentence[i+2]['upos'],
            'word_after_next_word_xpos': sentence[i+2]['xpos']
        })

    return features

In [40]:
%%time
X_train, y_train = data_prep('../data/Train.txt')

2021-09-29 12:16:15 INFO: Loading these models for language: bg (Bulgarian):
| Processor | Package  |
------------------------
| tokenize  | standard |

2021-09-29 12:16:15 INFO: Use device: gpu
2021-09-29 12:16:15 INFO: Loading: tokenize
2021-09-29 12:16:15 INFO: Done loading processors!
2021-09-29 12:16:32 INFO: Loading these models for language: bg (Bulgarian):
| Processor | Package  |
------------------------
| tokenize  | standard |
| pos       | standard |

2021-09-29 12:16:32 INFO: Use device: gpu
2021-09-29 12:16:32 INFO: Loading: tokenize
2021-09-29 12:16:32 INFO: Loading: pos
2021-09-29 12:16:33 INFO: Done loading processors!


CPU times: user 23min 6s, sys: 2.15 s, total: 23min 9s
Wall time: 23min 8s


In [41]:
verify_prepped_data(X_train, y_train)

In [42]:
%%time
X_dev, y_dev = data_prep('../data/Dev.txt')

2021-09-29 12:39:24 INFO: Loading these models for language: bg (Bulgarian):
| Processor | Package  |
------------------------
| tokenize  | standard |

2021-09-29 12:39:24 INFO: Use device: gpu
2021-09-29 12:39:24 INFO: Loading: tokenize
2021-09-29 12:39:24 INFO: Done loading processors!
2021-09-29 12:39:36 INFO: Loading these models for language: bg (Bulgarian):
| Processor | Package  |
------------------------
| tokenize  | standard |
| pos       | standard |

2021-09-29 12:39:36 INFO: Use device: gpu
2021-09-29 12:39:36 INFO: Loading: tokenize
2021-09-29 12:39:36 INFO: Loading: pos
2021-09-29 12:39:38 INFO: Done loading processors!


CPU times: user 11min 58s, sys: 952 ms, total: 11min 59s
Wall time: 11min 58s


In [43]:
verify_prepped_data(X_dev, y_dev)

In [44]:
%%time
crf = sklearn_crfsuite.CRF(
    algorithm='lbfgs',
    c1=0.1,
    c2=0.1,
    max_iterations=100,
    all_possible_transitions=True
)
crf.fit(X_train, y_train)

CPU times: user 32min 9s, sys: 3.06 s, total: 32min 12s
Wall time: 32min 8s


In [45]:
labels = list(crf.classes_)
y_pred = crf.predict(X_dev)
print(metrics.flat_f1_score(y_dev, y_pred, average='weighted', labels=labels))
print(metrics.flat_classification_report(y_dev, y_pred, labels=labels, digits=3))
labels.remove('')
print(metrics.flat_f1_score(y_dev, y_pred, average='weighted', labels=labels))
print(metrics.flat_classification_report(y_dev, y_pred, labels=labels, digits=3))
punctuated_sentences = punctuate(X_dev, y_pred)
print(punctuated_sentences[:100])

  average, "true nor predicted", 'F-score is', len(true_sum)


0.9382267756314827


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


              precision    recall  f1-score   support

                  0.959     0.988     0.973    227825
           :      0.903     0.646     0.753       923
           !      0.548     0.196     0.289       347
           ,      0.788     0.644     0.709     18592
           ?      0.846     0.668     0.747       443
           ;      0.241     0.043     0.074       622
           .      0.933     0.987     0.959     11401
           -      0.562     0.103     0.174       882
          ,(      0.000     0.000     0.000         5
           "      0.616     0.113     0.192      1596
         ").      0.000     0.000     0.000         0
           (      0.966     0.877     0.919      1656
           )      0.880     0.838     0.859      1001
          ?-      0.000     0.000     0.000         4
          .-      0.000     0.000     0.000         2
          :)      0.000     0.000     0.000         0
          !-      0.059     0.050     0.054        20
          :"      0.500    



              precision    recall  f1-score   support

           :      0.903     0.646     0.753       923
           !      0.548     0.196     0.289       347
           ,      0.788     0.644     0.709     18592
           ?      0.846     0.668     0.747       443
           ;      0.241     0.043     0.074       622
           .      0.933     0.987     0.959     11401
           -      0.562     0.103     0.174       882
          ,(      0.000     0.000     0.000         5
           "      0.616     0.113     0.192      1596
         ").      0.000     0.000     0.000         0
           (      0.966     0.877     0.919      1656
           )      0.880     0.838     0.859      1001
          ?-      0.000     0.000     0.000         4
          .-      0.000     0.000     0.000         2
          :)      0.000     0.000     0.000         0
          !-      0.059     0.050     0.054        20
          :"      0.500     0.045     0.082        67
          ."      0.333    

### Add more context of words after

In [46]:
# input - sentence (list of dictionaries) and an index of a word (dictionary)
# output - list of dictionaries (features, for each word)
def word2features(sentence, i):
    sentence_contains_interrogative_word, interrogative_word_position = contains_interrogative_word(sentence)
    sentence_contains_interrogative_particle, interrogative_particle_position = contains_interrogative_particle(sentence)

    features = {
        'word': sentence[i]['text'],
        'sent_len': len(sentence),
        'pos_in_sent': i,
        'upos': sentence[i]['upos'],
        'xpos': sentence[i]['xpos'],
        'first_word_in_sent': sentence[0]['text'],
        'contains_interrogative_word': sentence_contains_interrogative_word,
        'interrogative_word_position': interrogative_word_position,
        'contains_interrogative_particle': sentence_contains_interrogative_particle,
        'interrogative_particle_position': interrogative_particle_position
    }

    if i > 0:
        features.update({
            'prev_word': sentence[i-1]['text'],
            'prev_word_upos': sentence[i-1]['upos'],
            'prev_word_xpos': sentence[i-1]['xpos']
        })
    else:
        features.update({
            'BOS': True
        })

    if i < len(sentence)-1:
        features.update({
            'next_word': sentence[i+1]['text'],
            'next_word_upos': sentence[i+1]['upos'],
            'next_word_xpos': sentence[i+1]['xpos']
        })
    else:
        features.update({
            'EOS': True
        })
        
    if i < len(sentence)-2:
        features.update({
            'word_after_next_word': sentence [i+2]['text'],
            'word_after_next_word_upos': sentence[i+2]['upos'],
            'word_after_next_word_xpos': sentence[i+2]['xpos']
        })

    if i < len(sentence)-3:
        features.update({
            '2_words_after_next_word': sentence [i+3]['text'],
            '2_words_after_next_word_upos': sentence[i+3]['upos'],
            '2_words_after_next_word_xpos': sentence[i+3]['xpos']
        })

    return features

In [47]:
%%time
X_train, y_train = data_prep('../data/Train.txt')

2021-09-29 13:23:51 INFO: Loading these models for language: bg (Bulgarian):
| Processor | Package  |
------------------------
| tokenize  | standard |

2021-09-29 13:23:51 INFO: Use device: gpu
2021-09-29 13:23:51 INFO: Loading: tokenize
2021-09-29 13:23:51 INFO: Done loading processors!
2021-09-29 13:24:17 INFO: Loading these models for language: bg (Bulgarian):
| Processor | Package  |
------------------------
| tokenize  | standard |
| pos       | standard |

2021-09-29 13:24:17 INFO: Use device: gpu
2021-09-29 13:24:17 INFO: Loading: tokenize
2021-09-29 13:24:17 INFO: Loading: pos
2021-09-29 13:24:18 INFO: Done loading processors!


CPU times: user 24min 15s, sys: 2.66 s, total: 24min 17s
Wall time: 24min 16s


In [48]:
verify_prepped_data(X_train, y_train)

In [49]:
%%time
X_dev, y_dev = data_prep('../data/Dev.txt')

2021-09-29 13:48:07 INFO: Loading these models for language: bg (Bulgarian):
| Processor | Package  |
------------------------
| tokenize  | standard |

2021-09-29 13:48:07 INFO: Use device: gpu
2021-09-29 13:48:07 INFO: Loading: tokenize
2021-09-29 13:48:07 INFO: Done loading processors!
2021-09-29 13:48:20 INFO: Loading these models for language: bg (Bulgarian):
| Processor | Package  |
------------------------
| tokenize  | standard |
| pos       | standard |

2021-09-29 13:48:20 INFO: Use device: gpu
2021-09-29 13:48:20 INFO: Loading: tokenize
2021-09-29 13:48:20 INFO: Loading: pos
2021-09-29 13:48:22 INFO: Done loading processors!


CPU times: user 12min, sys: 896 ms, total: 12min 1s
Wall time: 12min


In [50]:
verify_prepped_data(X_dev, y_dev)

In [51]:
%%time
crf = sklearn_crfsuite.CRF(
    algorithm='lbfgs',
    c1=0.1,
    c2=0.1,
    max_iterations=100,
    all_possible_transitions=True
)
crf.fit(X_train, y_train)

CPU times: user 29min 5s, sys: 1.5 s, total: 29min 7s
Wall time: 29min 3s


In [52]:
labels = list(crf.classes_)
y_pred = crf.predict(X_dev)
print(metrics.flat_f1_score(y_dev, y_pred, average='weighted', labels=labels))
print(metrics.flat_classification_report(y_dev, y_pred, labels=labels, digits=3))
labels.remove('')
print(metrics.flat_f1_score(y_dev, y_pred, average='weighted', labels=labels))
print(metrics.flat_classification_report(y_dev, y_pred, labels=labels, digits=3))
punctuated_sentences = punctuate(X_dev, y_pred)
print(punctuated_sentences[:100])

  average, "true nor predicted", 'F-score is', len(true_sum)


0.936269375617333


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


              precision    recall  f1-score   support

                  0.958     0.988     0.973    227825
           :      0.895     0.637     0.744       923
           !      0.538     0.161     0.248       347
           ,      0.793     0.636     0.705     18592
           ?      0.853     0.679     0.756       443
           ;      0.261     0.047     0.079       622
           .      0.921     0.983     0.951     11401
           -      0.542     0.087     0.150       882
          ,(      0.000     0.000     0.000         5
           "      0.601     0.120     0.200      1596
         ").      0.000     0.000     0.000         0
           (      0.969     0.867     0.915      1656
           )      0.760     0.745     0.753      1001
          ?-      0.000     0.000     0.000         4
          .-      0.000     0.000     0.000         2
          :)      0.000     0.000     0.000         0
          !-      0.000     0.000     0.000        20
          :"      0.500    



              precision    recall  f1-score   support

           :      0.895     0.637     0.744       923
           !      0.538     0.161     0.248       347
           ,      0.793     0.636     0.705     18592
           ?      0.853     0.679     0.756       443
           ;      0.261     0.047     0.079       622
           .      0.921     0.983     0.951     11401
           -      0.542     0.087     0.150       882
          ,(      0.000     0.000     0.000         5
           "      0.601     0.120     0.200      1596
         ").      0.000     0.000     0.000         0
           (      0.969     0.867     0.915      1656
           )      0.760     0.745     0.753      1001
          ?-      0.000     0.000     0.000         4
          .-      0.000     0.000     0.000         2
          :)      0.000     0.000     0.000         0
          !-      0.000     0.000     0.000        20
          :"      0.500     0.030     0.056        67
          ."      0.400    

### Check if word is starts with capital letter

In [53]:
# input - sentence (list of dictionaries) and an index of a word (dictionary)
# output - list of dictionaries (features, for each word)
def word2features(sentence, i):
    sentence_contains_interrogative_word, interrogative_word_position = contains_interrogative_word(sentence)
    sentence_contains_interrogative_particle, interrogative_particle_position = contains_interrogative_particle(sentence)

    features = {
        'word': sentence[i]['text'],
        'sent_len': len(sentence),
        'pos_in_sent': i,
        'upos': sentence[i]['upos'],
        'xpos': sentence[i]['xpos'],
        'first_word_in_sent': sentence[0]['text'],
        'contains_interrogative_word': sentence_contains_interrogative_word,
        'interrogative_word_position': interrogative_word_position,
        'contains_interrogative_particle': sentence_contains_interrogative_particle,
        'interrogative_particle_position': interrogative_particle_position,
        'starts_with_capital_letter': sentence[i]['text'][0].isupper()
    }

    if i > 0:
        features.update({
            'prev_word': sentence[i-1]['text'],
            'prev_word_upos': sentence[i-1]['upos'],
            'prev_word_xpos': sentence[i-1]['xpos'],
            'prev_word_starts_with_capital_letter': sentence[i-1]['text'][0].isupper()
        })
    else:
        features.update({
            'BOS': True
        })

    if i < len(sentence)-1:
        features.update({
            'next_word': sentence[i+1]['text'],
            'next_word_upos': sentence[i+1]['upos'],
            'next_word_xpos': sentence[i+1]['xpos'],
            'next_word_starts_with_capital_letter': sentence[i+1]['text'][0].isupper()
        })
    else:
        features.update({
            'EOS': True
        })
        
    if i < len(sentence)-2:
        features.update({
            'word_after_next_word': sentence [i+2]['text'],
            'word_after_next_word_upos': sentence[i+2]['upos'],
            'word_after_next_word_xpos': sentence[i+2]['xpos'],
            'word_after_next_word_starts_with_capital_letter': sentence[i+2]['text'][0].isupper()
        })

    return features

In [54]:
%%time
X_train, y_train = data_prep('../data/Train.txt')

2021-09-29 14:54:36 INFO: Loading these models for language: bg (Bulgarian):
| Processor | Package  |
------------------------
| tokenize  | standard |

2021-09-29 14:54:36 INFO: Use device: gpu
2021-09-29 14:54:36 INFO: Loading: tokenize
2021-09-29 14:54:36 INFO: Done loading processors!
2021-09-29 14:54:52 INFO: Loading these models for language: bg (Bulgarian):
| Processor | Package  |
------------------------
| tokenize  | standard |
| pos       | standard |

2021-09-29 14:54:52 INFO: Use device: gpu
2021-09-29 14:54:52 INFO: Loading: tokenize
2021-09-29 14:54:52 INFO: Loading: pos
2021-09-29 14:54:53 INFO: Done loading processors!


CPU times: user 23min 40s, sys: 2.24 s, total: 23min 42s
Wall time: 23min 41s


In [55]:
verify_prepped_data(X_train, y_train)

In [56]:
%%time
X_dev, y_dev = data_prep('../data/Dev.txt')

2021-09-29 15:18:18 INFO: Loading these models for language: bg (Bulgarian):
| Processor | Package  |
------------------------
| tokenize  | standard |

2021-09-29 15:18:18 INFO: Use device: gpu
2021-09-29 15:18:18 INFO: Loading: tokenize
2021-09-29 15:18:18 INFO: Done loading processors!
2021-09-29 15:18:30 INFO: Loading these models for language: bg (Bulgarian):
| Processor | Package  |
------------------------
| tokenize  | standard |
| pos       | standard |

2021-09-29 15:18:30 INFO: Use device: gpu
2021-09-29 15:18:30 INFO: Loading: tokenize
2021-09-29 15:18:30 INFO: Loading: pos
2021-09-29 15:18:32 INFO: Done loading processors!


CPU times: user 11min 55s, sys: 996 ms, total: 11min 56s
Wall time: 11min 55s


In [57]:
verify_prepped_data(X_dev, y_dev)

In [58]:
%%time
crf = sklearn_crfsuite.CRF(
    algorithm='lbfgs',
    c1=0.1,
    c2=0.1,
    max_iterations=100,
    all_possible_transitions=True
)
crf.fit(X_train, y_train)

CPU times: user 30min 15s, sys: 1.28 s, total: 30min 16s
Wall time: 30min 13s


In [59]:
labels = list(crf.classes_)
y_pred = crf.predict(X_dev)
print(metrics.flat_f1_score(y_dev, y_pred, average='weighted', labels=labels))
print(metrics.flat_classification_report(y_dev, y_pred, labels=labels, digits=3))
labels.remove('')
print(metrics.flat_f1_score(y_dev, y_pred, average='weighted', labels=labels))
print(metrics.flat_classification_report(y_dev, y_pred, labels=labels, digits=3))
punctuated_sentences = punctuate(X_dev, y_pred)
print(punctuated_sentences[:100])

  average, "true nor predicted", 'F-score is', len(true_sum)


0.9368432416930271


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


              precision    recall  f1-score   support

                  0.957     0.990     0.973    227825
           :      0.911     0.711     0.799       923
           !      0.516     0.182     0.269       347
           ,      0.814     0.619     0.703     18592
           ?      0.839     0.704     0.766       443
           ;      0.248     0.043     0.074       622
           .      0.923     0.982     0.952     11401
           -      0.519     0.091     0.154       882
          ,(      0.000     0.000     0.000         5
           "      0.635     0.147     0.239      1596
         ").      0.000     0.000     0.000         0
           (      0.965     0.873     0.917      1656
           )      0.751     0.755     0.753      1001
          ?-      0.000     0.000     0.000         4
          .-      0.000     0.000     0.000         2
          :)      0.000     0.000     0.000         0
          !-      0.000     0.000     0.000        20
          :"      0.700    



              precision    recall  f1-score   support

           :      0.911     0.711     0.799       923
           !      0.516     0.182     0.269       347
           ,      0.814     0.619     0.703     18592
           ?      0.839     0.704     0.766       443
           ;      0.248     0.043     0.074       622
           .      0.923     0.982     0.952     11401
           -      0.519     0.091     0.154       882
          ,(      0.000     0.000     0.000         5
           "      0.635     0.147     0.239      1596
         ").      0.000     0.000     0.000         0
           (      0.965     0.873     0.917      1656
           )      0.751     0.755     0.753      1001
          ?-      0.000     0.000     0.000         4
          .-      0.000     0.000     0.000         2
          :)      0.000     0.000     0.000         0
          !-      0.000     0.000     0.000        20
          :"      0.700     0.104     0.182        67
          ."      0.143    

In [70]:
X_dev[30]

[{'word': 'Защото',
  'sent_len': 12,
  'pos_in_sent': 0,
  'upos': 'ADV',
  'xpos': 'Prc',
  'first_word_in_sent': 'Защото',
  'contains_interrogative_word': False,
  'interrogative_word_position': -1,
  'contains_interrogative_particle': False,
  'interrogative_particle_position': -1,
  'starts_with_capital_letter': True,
  'BOS': True,
  'next_word': 'вашата',
  'next_word_upos': 'DET',
  'next_word_xpos': 'Pshl-s2fd',
  'next_word_starts_with_capital_letter': False,
  'word_after_next_word': 'послушност',
  'word_after_next_word_upos': 'NOUN',
  'word_after_next_word_xpos': 'Ncfsi',
  'word_after_next_word_starts_with_capital_letter': False},
 {'word': 'вашата',
  'sent_len': 12,
  'pos_in_sent': 1,
  'upos': 'DET',
  'xpos': 'Pshl-s2fd',
  'first_word_in_sent': 'Защото',
  'contains_interrogative_word': False,
  'interrogative_word_position': -1,
  'contains_interrogative_particle': False,
  'interrogative_particle_position': -1,
  'starts_with_capital_letter': False,
  'prev_word