### Function Definitions

In [1]:
import json
import sklearn_crfsuite
from sklearn_crfsuite import metrics
import classla
import json
import re

In [2]:
# input - name of a text file with one sentence per line
# output - list of sentences (strings), one per line, without line terminators
def read_file_as_list_of_sentences(input_file_name):
    # Decode explicitly as UTF-8: without it open() falls back to the platform
    # default encoding, which can mis-decode non-ASCII (e.g. Cyrillic) text.
    with open(input_file_name, "r", encoding="utf-8") as input_file:
        return input_file.read().splitlines()

In [3]:
# input - list of sentences (strings) and a callable classla pipeline (tokenize or tokenize+POS)
# output - list with one entry per input string: element [0][0] of the processed
#          document's to_dict() result (presumably the token dictionaries of the
#          first detected sentence - confirm against the classla version in use)
def run_through_classla_pipeline(list_of_sentences, pipeline):
    processed = []

    for raw_sentence in list_of_sentences:
        document = pipeline(raw_sentence).to_dict()
        # Only the [0][0] element is kept; anything else in the document is dropped.
        processed.append(document[0][0])

    return processed

In [4]:
# input - list of dictionaries (a dictionary for each token - the tokenized version of the sentence)
# output - new list of dictionaries in which every run of consecutive punctuation
#          tokens is merged into a single token whose 'text' is the concatenation
def squash_punctuation(sentence):
    punctuation = ',()"[];.?!:-'

    # A token counts as punctuation when every character is a punctuation mark.
    # This is the same per-character test word2label and remove_punctuation use;
    # a substring test would miss multi-character tokens such as '...'.
    def is_punctuation(text):
        return all(character in punctuation for character in text)

    new_sentence = []

    for token in sentence:
        if not is_punctuation(token['text']):
            new_sentence.append(token)
        elif new_sentence and is_punctuation(new_sentence[-1]['text']):
            # Continue the current punctuation run.
            new_sentence[-1]['text'] += token['text']
        else:
            # Start a new run on a copy so the merge above never mutates the
            # caller's token dictionaries.
            new_sentence.append(dict(token))

    return new_sentence

In [5]:
# input - sentence (list of token dictionaries) and the index i of a token in it
# output - the label of token i:
#          None  - the token itself consists only of punctuation marks
#          str   - the text of the next token, when that token is pure punctuation
#          ''    - a word that is not followed by a punctuation token
def word2label(sentence, i):
    punctuation_marks = ',()"[];.?!:-'

    def only_punctuation(text):
        for character in text:
            if character not in punctuation_marks:
                return False
        return True

    if only_punctuation(sentence[i]['text']):
        return None

    has_next = i + 1 < len(sentence)
    if has_next:
        following = sentence[i + 1]['text']
        if only_punctuation(following):
            return following

    return ''

In [6]:
# input - sentence (list of token dictionaries)
# output - list of labels (strings), one per non-punctuation token; the None
#          labels that word2label gives to punctuation tokens are left out
def sent2labels(sentence):
    labels = []

    for position in range(len(sentence)):
        label = word2label(sentence, position)
        if label is not None:
            labels.append(label)

    return labels

In [7]:
# input - list of dictionaries (a dictionary for each token - the tokenized version of the sentence)
# output - new sentence (string): the texts of all non-punctuation tokens, each
#          followed by a single space (so a non-empty result ends with a space)
def remove_punctuation(sentence):
    marks = ',()"[];.?!:-'

    # Keep only tokens that contain at least one non-punctuation character.
    kept_words = [
        token['text']
        for token in sentence
        if not all(character in marks for character in token['text'])
    ]

    return ''.join(word + ' ' for word in kept_words)

In [8]:
# input - sentence (list of dictionaries, each with an 'xpos' tag)
# output - tuple (found, position): (True, index of the first word whose xpos
#          starts with 'Pi') or (False, -1) when there is no such word
# A pair is returned (like the contains_*_before helpers) because word2features
# unpacks the result into a flag and a position; a bare bool cannot be unpacked.
def contains_interrogative_word(sentence):
    for word_i, word in enumerate(sentence):
        if word['xpos'].startswith('Pi'):
            return True, word_i

    return False, -1

In [9]:
# input - sentence (list of dictionaries, each with an 'xpos' tag)
# output - tuple (found, position): (True, index of the first word whose xpos
#          is exactly 'Ti') or (False, -1) when there is no such word
# A pair is returned (like the contains_*_before helpers) because word2features
# unpacks the result into a flag and a position; a bare bool cannot be unpacked.
def contains_interrogative_particle(sentence):
    for word_i, word in enumerate(sentence):
        if word['xpos'] == 'Ti':
            return True, word_i

    return False, -1

In [10]:
# input - sentence (list of dictionaries, each with an 'xpos' tag)
# output - tuple (found, position): (True, index of the first word whose xpos
#          starts with 'V' and has 'z' as its fifth character) or (False, -1)
#          (the 'z' is presumably the imperative marker of the tagset - confirm)
# A pair is returned (like the contains_*_before helpers) because word2features
# unpacks the result into a flag and a position; a bare bool cannot be unpacked.
def contains_imperative_verb(sentence):
    for word_i, word in enumerate(sentence):
        xpos = word['xpos']
        # Length guard: empty or short tags must not raise IndexError on xpos[4].
        if len(xpos) > 4 and xpos[0] == 'V' and xpos[4] == 'z':
            return True, word_i

    return False, -1

In [11]:
# input - sentence (list of dictionaries with an 'xpos' tag) and an index i
# output - tuple (found, position) for the first word among sentence[:i] whose
#          xpos starts with 'Pr'; (False, -1) when there is none
def contains_relative_pronoun_before(sentence, i):
    matching_positions = (
        position
        for position, word in enumerate(sentence[:i])
        if word['xpos'].startswith('Pr')
    )
    first_match = next(matching_positions, -1)

    return first_match != -1, first_match

In [12]:
# input - sentence (list of dictionaries with an 'xpos' tag) and an index i
# output - tuple (found, position) for the first word among sentence[:i] whose
#          xpos starts with 'C'; (False, -1) when there is none
def contains_conj_before(sentence, i):
    for position, word in enumerate(sentence[:i]):
        tag = word['xpos']
        if tag.startswith('C'):
            return True, position

    return False, -1

In [13]:
# input - sentence (list of dictionaries with an 'xpos' tag) and an index i
# output - tuple (found, position) for the first word among sentence[:i] whose
#          xpos is exactly 'Cr' or 'Cp'; (False, -1) when there is none
def contains_repetitive_coord_conj_before(sentence, i):
    wanted_tags = ('Cr', 'Cp')
    position = 0

    for word in sentence[:i]:
        if word['xpos'] in wanted_tags:
            return True, position
        position += 1

    return False, -1

In [14]:
# input - sentence (list of dictionaries with 'text' and 'xpos') and an index i
# output - tuple (found, tag): when the word right after position i also occurs
#          (same text) among sentence[:i], returns (True, xpos of that earlier
#          occurrence); otherwise - or when i has no following word - (False, '')
def contains_repetitive_word_before(sentence, i):
    # Without a word after position i there is nothing to compare against.
    if not i < len(sentence) - 1:
        return False, ''

    for earlier_word in sentence[:i]:
        if earlier_word['text'] == sentence[i + 1]['text']:
            return True, earlier_word['xpos']

    return False, ''

In [15]:
# input - sentence (list of dictionaries with a 'upos' tag) and an index i
# output - number of words among sentence[:i] whose upos is 'VERB'
def count_of_verbs_before(sentence, i):
    return sum(1 for word in sentence[:i] if word['upos'] == 'VERB')

In [16]:
# input - sentence (list of dictionaries, one per POS-tagged word) and the index i of a word in it
# output - a single dictionary with the features of the word at index i:
#          the word itself, sentence-level flags, and up to two words of
#          context on each side (with BOS / EOS markers at the sentence edges)
def word2features(sentence, i):
    # Every helper result is unpacked as a (flag, extra) pair.
    has_interrogative_word, interrogative_word_at = contains_interrogative_word(sentence)
    has_interrogative_particle, interrogative_particle_at = contains_interrogative_particle(sentence)
    has_imperative_verb, imperative_verb_at = contains_imperative_verb(sentence)
    has_repeated_word, repeated_word_tag = contains_repetitive_word_before(sentence, i)

    current = sentence[i]
    last_index = len(sentence) - 1

    features = {
        'word': current['text'],
        'sent_len': len(sentence),
        'pos_in_sent': i,
        'upos': current['upos'],
        'xpos': current['xpos'],
        'first_word_in_sent': sentence[0]['text'],
        'contains_interrogative_word': has_interrogative_word,
        'interrogative_word_position': interrogative_word_at,
        'contains_interrogative_particle': has_interrogative_particle,
        'interrogative_particle_position': interrogative_particle_at,
        'contains_imperative_verb': has_imperative_verb,
        'imperative_verb_position': imperative_verb_at,
        'starts_with_capital_letter': current['text'][0].isupper(),
        'contains_repetitive_word_before': has_repeated_word,
        'repetitive_word_tag': repeated_word_tag
    }

    # Left context: previous word, or a beginning-of-sentence marker.
    if i > 0:
        previous = sentence[i - 1]
        features['prev_word'] = previous['text']
        features['prev_word_upos'] = previous['upos']
        features['prev_word_xpos'] = previous['xpos']
        features['prev_word_starts_with_capital_letter'] = previous['text'][0].isupper()
    else:
        features['BOS'] = True

    if i > 1:
        before_previous = sentence[i - 2]
        features['word_before_prev_word'] = before_previous['text']
        features['word_before_prev_word_upos'] = before_previous['upos']
        features['word_before_prev_word_xpos'] = before_previous['xpos']
        features['word_before_prev_word_starts_with_capital_letter'] = before_previous['text'][0].isupper()

    # Right context: next word, or an end-of-sentence marker.
    if i < last_index:
        following = sentence[i + 1]
        features['next_word'] = following['text']
        features['next_word_upos'] = following['upos']
        features['next_word_xpos'] = following['xpos']
        features['next_word_starts_with_capital_letter'] = following['text'][0].isupper()
    else:
        features['EOS'] = True

    if i < last_index - 1:
        after_following = sentence[i + 2]
        features['word_after_next_word'] = after_following['text']
        features['word_after_next_word_upos'] = after_following['upos']
        features['word_after_next_word_xpos'] = after_following['xpos']
        features['word_after_next_word_starts_with_capital_letter'] = after_following['text'][0].isupper()

    return features

In [17]:
# input - sentence (list of dictionaries, one per word)
# output - list of feature dictionaries, one per word, in sentence order
def sent2features(sentence):
    all_features = []

    for position in range(len(sentence)):
        all_features.append(word2features(sentence, position))

    return all_features

In [18]:
# input - JSON-serializable data and the name of the output JSON file
# output - None; the data is written to the file (an existing file is overwritten)
def save_as_json(data, output_file_name):
    with open(output_file_name, mode="w") as target:
        # json.dump's defaults are used, so non-ASCII characters are escaped.
        json.dump(data, target)

In [19]:
# input - name of a JSON file
# output - the contents of the JSON file as a Python object
def load_json(json_file_name):
    # JSON text is UTF-8; decode it explicitly instead of relying on the
    # platform default encoding, which breaks on non-ASCII content elsewhere.
    with open(json_file_name, "r", encoding="utf-8") as json_file:
        return json.load(json_file)

In [20]:
# input - name of a text file with one sentence per line and a flag indicating
#         whether X and y should also be saved as JSON next to the input file
# output - X and y - per-sentence lists of feature dictionaries and of labels
#
# Labels come from the punctuated text (tokenize -> squash punctuation ->
# sent2labels); features come from the same sentences with the punctuation
# removed and then re-tokenized and POS-tagged. Because the two sides are
# tokenized separately, their per-sentence lengths can differ - that is what
# verify_prepped_data checks afterwards.
def data_prep(input_file_name, json_serialize=False):
    data = read_file_as_list_of_sentences(input_file_name)
    nlp_tokenize = classla.Pipeline('bg', processors='tokenize')
    tokenized_data = run_through_classla_pipeline(data, nlp_tokenize)

    if len(data) != len(tokenized_data):
        print("Warning: Mismatch in the count of the data and tokenized data")

    squashed_tokenized_data = [squash_punctuation(sentence) for sentence in tokenized_data]

    if len(tokenized_data) != len(squashed_tokenized_data):
        print("Warning: Mismatch in the count of the tokenized and squashed tokenized data")

    y = [sent2labels(sentence) for sentence in squashed_tokenized_data]

    if len(squashed_tokenized_data) != len(y):
        print("Warning: Mismatch in the count of the squashed tokenized data and labeled data")

    data_without_punctuation = [remove_punctuation(sentence) for sentence in squashed_tokenized_data]

    if len(data_without_punctuation) != len(y):
        print("Warning: Mismatch in the count of the data without punctuation and labeled data")

    nlp_pos_tokenize = classla.Pipeline('bg', processors='tokenize,pos')
    pos_tokenized_data = run_through_classla_pipeline(data_without_punctuation, nlp_pos_tokenize)

    if len(data_without_punctuation) != len(pos_tokenized_data):
        print("Warning: Mismatch in the count of the data without punctuation and POS tokenized data")

    X = [sent2features(sentence) for sentence in pos_tokenized_data]

    if len(X) != len(pos_tokenized_data):
        print("Warning: Mismatch in the count of the prepped data and POS tokenized data")

    if json_serialize:
        # Strip only a trailing ".txt" (raw, anchored pattern) and always append
        # the suffix, so the output name can never equal the input name and
        # overwrite the source data when the input has no ".txt" extension.
        output_base_name = re.sub(r'\.txt$', '', input_file_name)
        save_as_json(X, output_base_name + '_X.json')
        save_as_json(y, output_base_name + '_y.json')

    return X, y

In [21]:
# input - X and y - features and labels (parallel lists, one entry per sentence)
# output - None; prints every (features, labels) pair whose lengths differ,
#          so no output means every compared pair is aligned
def verify_prepped_data(X, y):
    mismatched_pairs = (
        (feat, label)
        for feat, label in zip(X, y)
        if len(feat) != len(label)
    )

    for pair in mismatched_pairs:
        print(*pair)

In [22]:
# input - X and y - features and labels (parallel lists, one entry per sentence)
# output - list of punctuated sentences (strings): every word is immediately
#          followed by its label, words are separated by single spaces and
#          trailing whitespace is stripped
def punctuate(X, y):
    def render(feat, label):
        # label is indexed by position, so a label list shorter than the
        # feature list raises IndexError rather than being silently truncated.
        pieces = [token['word'] + label[position] for position, token in enumerate(feat)]
        return ' '.join(pieces).rstrip()

    return [render(feat, label) for feat, label in zip(X, y)]

### Data Prep

In [23]:
%%time
# Build the training features and labels from the training file
# (json_serialize keeps its default, so nothing is written to disk).
X_train, y_train = data_prep('../data/Train.txt')

2021-09-28 15:09:41 INFO: Loading these models for language: bg (Bulgarian):
| Processor | Package  |
------------------------
| tokenize  | standard |

2021-09-28 15:09:41 INFO: Use device: gpu
2021-09-28 15:09:41 INFO: Loading: tokenize
2021-09-28 15:09:41 INFO: Done loading processors!
2021-09-28 15:10:02 INFO: Loading these models for language: bg (Bulgarian):
| Processor | Package  |
------------------------
| tokenize  | standard |
| pos       | standard |

2021-09-28 15:10:02 INFO: Use device: gpu
2021-09-28 15:10:02 INFO: Loading: tokenize
2021-09-28 15:10:02 INFO: Loading: pos
2021-09-28 15:10:08 INFO: Done loading processors!


CPU times: user 23min 47s, sys: 3.55 s, total: 23min 51s
Wall time: 23min 50s


In [19]:
# Print every training sentence whose feature count differs from its label count.
verify_prepped_data(X_train, y_train)

In [26]:
%%time
# Build the development features and labels the same way as the training data.
X_dev, y_dev = data_prep('../data/Dev.txt')

2021-09-28 16:53:08 INFO: Loading these models for language: bg (Bulgarian):
| Processor | Package  |
------------------------
| tokenize  | standard |

2021-09-28 16:53:08 INFO: Use device: gpu
2021-09-28 16:53:08 INFO: Loading: tokenize
2021-09-28 16:53:08 INFO: Done loading processors!
2021-09-28 16:53:16 INFO: Loading these models for language: bg (Bulgarian):
| Processor | Package  |
------------------------
| tokenize  | standard |
| pos       | standard |

2021-09-28 16:53:16 INFO: Use device: gpu
2021-09-28 16:53:16 INFO: Loading: tokenize
2021-09-28 16:53:16 INFO: Loading: pos
2021-09-28 16:53:17 INFO: Done loading processors!


CPU times: user 11min 46s, sys: 1.19 s, total: 11min 47s
Wall time: 11min 46s


In [21]:
# Print every development sentence whose feature count differs from its label count.
verify_prepped_data(X_dev, y_dev)

### CRF Model

In [24]:
%%time
# Train a CRF on the prepared training data.
crf = sklearn_crfsuite.CRF(
    algorithm='lbfgs',
    c1=0.1,  # L1 regularization coefficient (per the sklearn-crfsuite docs)
    c2=0.1,  # L2 regularization coefficient (per the sklearn-crfsuite docs)
    max_iterations=100,
    all_possible_transitions=True
)
crf.fit(X_train, y_train)

CPU times: user 48min 25s, sys: 2.73 s, total: 48min 28s
Wall time: 48min 21s


In [27]:
# Copy the label set of the trained model into a list (it is edited further down).
labels = list(crf.classes_)

In [None]:
# Display the label list.
labels

In [29]:
# Predict a label sequence for every development sentence.
y_pred = crf.predict(X_dev)

In [None]:
# Weighted F1 over the labels currently in `labels`
# (at this point still including the empty "no punctuation" label).
metrics.flat_f1_score(y_dev, y_pred,
                      average='weighted', labels=labels)

In [None]:
# show all metrics
# (the label sorting below is disabled, so labels are reported in their current order)
# sorted_labels = sorted(
#     labels,
#     key=lambda name: (name[1:], name[0])
# )
print(metrics.flat_classification_report(
    y_dev, y_pred, labels=labels, digits=3
))

In [32]:
# Drop the empty label ('' = word not followed by punctuation) so that the
# metrics below only cover the punctuation labels.
# NOTE: list.remove raises ValueError if '' is absent, e.g. when this cell is re-run.
labels.remove('')

In [33]:
# Predict again for the development set (appears to repeat the earlier
# prediction cell - confirm whether crf or X_dev changed in between).
y_pred = crf.predict(X_dev)

In [None]:
# Weighted F1 restricted to the labels left in `labels`
# (the empty label was removed above).
metrics.flat_f1_score(y_dev, y_pred,
                      average='weighted', labels=labels)

In [None]:
# Per-label report restricted to the labels left in `labels`.
print(metrics.flat_classification_report(
    y_dev, y_pred, labels=labels, digits=3
))

In [31]:
# Rebuild readable sentences by attaching the predicted labels to the dev words.
punctuated_sentences = punctuate(X_dev, y_pred)

In [None]:
# Display the first 100 sentences punctuated by the model.
punctuated_sentences[:100]

In [None]:
# Display the first 100 sentences punctuated by the model (same expression as the previous cell).
punctuated_sentences[:100]

In [49]:
# Rebuild the reference sentences by attaching the gold dev labels to the dev words.
correct_punctuated_sentences = punctuate(X_dev, y_dev)

In [50]:
# Display the first 100 reference sentences for comparison with the model output.
correct_punctuated_sentences[:100]

['Но синовете им не умъртви, защото постъпи, според писаното в закона, в книгата на Мойсей, където ГОСПОД заповяда и каза: Бащите да не умират заради синовете и синовете да не умират заради бащите, а всеки да умира за собствения си грях.',
 'Но когато ти говоря, ще отворя устата ти и ти им кажи: Така казва Господ БОГ: Който слуша, нека слуша, а който не слуша, нека не слуша- защото са бунтовен дом.',
 'Защото кой е по-голям: този, който седи на трапезата ли, или онзи, който слугува?',
 'А след това ще се насели както в предишните дни, заявява ГОСПОД.',
 'А Той беше на задната част, заспал на възглавница.',
 'Ефрем е като птица, славата му ще отлети.',
 'И така, той си отиде от него на известно разстояние.',
 'Не бягам при халдейците.',
 'И израилевите синове излязоха от египетската земя строени.',
 'А когато Ахитофел видя, че съветът му не се изпълни, оседла магарето си и стана, и отиде у дома си, в своя град, и нареди домашните си работи, и се обеси.',
 'Ако умре някое животно от доби

### Grid Search Try - Worse Results

In [None]:
# from sklearn.metrics import make_scorer
# import scipy.stats
# from sklearn.model_selection import RandomizedSearchCV

In [None]:
# %%time
# # define fixed parameters and parameters to search
# crf = sklearn_crfsuite.CRF(
#     algorithm='lbfgs',
#     max_iterations=100,
#     all_possible_transitions=True
# )
# params_space = {
#     'c1': scipy.stats.expon(scale=0.5),
#     'c2': scipy.stats.expon(scale=0.05),
# }

# # use the same metric for evaluation
# f1_scorer = make_scorer(metrics.flat_f1_score,
#                         average='weighted', labels=labels)

# # search
# rs = RandomizedSearchCV(crf, params_space,
#                         cv=3,
#                         verbose=1,
#                         n_jobs=-1,
#                         n_iter=50,
#                         scoring=f1_scorer)
# rs.fit(X, y)

In [None]:
# crf = rs.best_estimator_
# print('best params:', rs.best_params_)
# print('best CV score:', rs.best_score_)
# print('model size: {:0.2f}M'.format(rs.best_estimator_.size_ / 1000000))

In [None]:
# crf = rs.best_estimator_
# y_pred = crf.predict(X_test)

# sorted_labels = sorted(
#     labels,
#     key=lambda name: (name[1:], name[0])
# )

# print(metrics.flat_classification_report(
#     y_test, y_pred, labels=sorted_labels, digits=3
# ))

### Sentence contains imperative verb

In [23]:
# input - sentence (list of dictionaries, one per POS-tagged word) and the index i of a word in it
# output - a single dictionary with the features of the word at index i
#          (variant with the sentence-level imperative-verb features)
def word2features(sentence, i):
    # Every helper result is unpacked as a (flag, position) pair.
    has_interrogative_word, interrogative_word_at = contains_interrogative_word(sentence)
    has_interrogative_particle, interrogative_particle_at = contains_interrogative_particle(sentence)
    has_imperative_verb, imperative_verb_at = contains_imperative_verb(sentence)

    current = sentence[i]
    last_index = len(sentence) - 1

    features = {
        'word': current['text'],
        'sent_len': len(sentence),
        'pos_in_sent': i,
        'upos': current['upos'],
        'xpos': current['xpos'],
        'first_word_in_sent': sentence[0]['text'],
        'contains_interrogative_word': has_interrogative_word,
        'interrogative_word_position': interrogative_word_at,
        'contains_interrogative_particle': has_interrogative_particle,
        'interrogative_particle_position': interrogative_particle_at,
        'contains_imperative_verb': has_imperative_verb,
        'imperative_verb_position': imperative_verb_at
    }

    # Left context: previous word, or a beginning-of-sentence marker.
    if i > 0:
        previous = sentence[i - 1]
        features['prev_word'] = previous['text']
        features['prev_word_upos'] = previous['upos']
        features['prev_word_xpos'] = previous['xpos']
    else:
        features['BOS'] = True

    # Right context: next word, or an end-of-sentence marker.
    if i < last_index:
        following = sentence[i + 1]
        features['next_word'] = following['text']
        features['next_word_upos'] = following['upos']
        features['next_word_xpos'] = following['xpos']
    else:
        features['EOS'] = True

    if i < last_index - 1:
        after_following = sentence[i + 2]
        features['word_after_next_word'] = after_following['text']
        features['word_after_next_word_upos'] = after_following['upos']
        features['word_after_next_word_xpos'] = after_following['xpos']

    return features

In [24]:
%%time
# Rebuild the training features and labels with the word2features variant defined above.
X_train, y_train = data_prep('../data/Train.txt')

2021-09-29 00:35:06 INFO: Loading these models for language: bg (Bulgarian):
| Processor | Package  |
------------------------
| tokenize  | standard |

2021-09-29 00:35:06 INFO: Use device: gpu
2021-09-29 00:35:06 INFO: Loading: tokenize
2021-09-29 00:35:06 INFO: Done loading processors!
2021-09-29 00:35:30 INFO: Loading these models for language: bg (Bulgarian):
| Processor | Package  |
------------------------
| tokenize  | standard |
| pos       | standard |

2021-09-29 00:35:30 INFO: Use device: gpu
2021-09-29 00:35:30 INFO: Loading: tokenize
2021-09-29 00:35:30 INFO: Loading: pos
2021-09-29 00:35:36 INFO: Done loading processors!


CPU times: user 23min 14s, sys: 3.57 s, total: 23min 18s
Wall time: 23min 17s


In [25]:
# Print every training sentence whose feature count differs from its label count.
verify_prepped_data(X_train, y_train)

In [26]:
%%time
# Rebuild the development features and labels with the same feature variant.
X_dev, y_dev = data_prep('../data/Dev.txt')

2021-09-29 00:58:24 INFO: Loading these models for language: bg (Bulgarian):
| Processor | Package  |
------------------------
| tokenize  | standard |

2021-09-29 00:58:24 INFO: Use device: gpu
2021-09-29 00:58:24 INFO: Loading: tokenize
2021-09-29 00:58:24 INFO: Done loading processors!
2021-09-29 00:58:37 INFO: Loading these models for language: bg (Bulgarian):
| Processor | Package  |
------------------------
| tokenize  | standard |
| pos       | standard |

2021-09-29 00:58:37 INFO: Use device: gpu
2021-09-29 00:58:37 INFO: Loading: tokenize
2021-09-29 00:58:37 INFO: Loading: pos
2021-09-29 00:58:38 INFO: Done loading processors!


CPU times: user 11min 49s, sys: 1.09 s, total: 11min 50s
Wall time: 11min 49s


In [27]:
# Print every development sentence whose feature count differs from its label count.
verify_prepped_data(X_dev, y_dev)

In [28]:
%%time
# Train a CRF on the rebuilt training data, with the same hyperparameters as the first run.
crf = sklearn_crfsuite.CRF(
    algorithm='lbfgs',
    c1=0.1,  # L1 regularization coefficient (per the sklearn-crfsuite docs)
    c2=0.1,  # L2 regularization coefficient (per the sklearn-crfsuite docs)
    max_iterations=100,
    all_possible_transitions=True
)
crf.fit(X_train, y_train)

CPU times: user 29min 3s, sys: 1.64 s, total: 29min 4s
Wall time: 29min 1s


In [None]:
# Evaluate on the dev set twice: first over every label the model knows,
# then again without the empty label.
labels = list(crf.classes_)
y_pred = crf.predict(X_dev)
print(metrics.flat_f1_score(y_dev, y_pred, average='weighted', labels=labels))
print(metrics.flat_classification_report(y_dev, y_pred, labels=labels, digits=3))
# Drop the empty label ('' = word not followed by punctuation);
# list.remove raises ValueError if '' is not among the labels.
labels.remove('')
print(metrics.flat_f1_score(y_dev, y_pred, average='weighted', labels=labels))
print(metrics.flat_classification_report(y_dev, y_pred, labels=labels, digits=3))
# Rebuild readable sentences from the predictions and show the first 100.
punctuated_sentences = punctuate(X_dev, y_pred)
print(punctuated_sentences[:100])

### Sentence contains relative pronoun

In [30]:
# input - sentence (list of dictionaries, one per POS-tagged word) and the index i of a word in it
# output - a single dictionary with the features of the word at index i
#          (variant with the sentence-level relative-pronoun features)
def word2features(sentence, i):
    # Every helper result is unpacked as a (flag, position) pair.
    sentence_contains_interrogative_word, interrogative_word_position = contains_interrogative_word(sentence)
    sentence_contains_interrogative_particle, interrogative_particle_position = contains_interrogative_particle(sentence)
    # This notebook defines no `contains_relative_pronoun`; the available helper
    # is contains_relative_pronoun_before(sentence, i). Passing len(sentence) as
    # the bound scans the whole sentence, which keeps the feature sentence-level
    # as its name says.
    sentence_contains_relative_pronoun, relative_pronoun_position = contains_relative_pronoun_before(sentence, len(sentence))

    features = {
        'word': sentence[i]['text'],
        'sent_len': len(sentence),
        'pos_in_sent': i,
        'upos': sentence[i]['upos'],
        'xpos': sentence[i]['xpos'],
        'first_word_in_sent': sentence[0]['text'],
        'contains_interrogative_word': sentence_contains_interrogative_word,
        'interrogative_word_position': interrogative_word_position,
        'contains_interrogative_particle': sentence_contains_interrogative_particle,
        'interrogative_particle_position': interrogative_particle_position,
        'contains_relative_pronoun': sentence_contains_relative_pronoun,
        'relative_pronoun_position': relative_pronoun_position
    }

    # Left context: previous word, or a beginning-of-sentence marker.
    if i > 0:
        features.update({
            'prev_word': sentence[i-1]['text'],
            'prev_word_upos': sentence[i-1]['upos'],
            'prev_word_xpos': sentence[i-1]['xpos']
        })
    else:
        features.update({
            'BOS': True
        })

    # Right context: next word, or an end-of-sentence marker.
    if i < len(sentence)-1:
        features.update({
            'next_word': sentence[i+1]['text'],
            'next_word_upos': sentence[i+1]['upos'],
            'next_word_xpos': sentence[i+1]['xpos']
        })
    else:
        features.update({
            'EOS': True
        })

    if i < len(sentence)-2:
        features.update({
            'word_after_next_word': sentence[i+2]['text'],
            'word_after_next_word_upos': sentence[i+2]['upos'],
            'word_after_next_word_xpos': sentence[i+2]['xpos']
        })

    return features

In [31]:
%%time
# Rebuild the training features and labels with the word2features variant defined above.
X_train, y_train = data_prep('../data/Train.txt')

2021-09-29 01:39:26 INFO: Loading these models for language: bg (Bulgarian):
| Processor | Package  |
------------------------
| tokenize  | standard |

2021-09-29 01:39:26 INFO: Use device: gpu
2021-09-29 01:39:26 INFO: Loading: tokenize
2021-09-29 01:39:26 INFO: Done loading processors!
2021-09-29 01:39:44 INFO: Loading these models for language: bg (Bulgarian):
| Processor | Package  |
------------------------
| tokenize  | standard |
| pos       | standard |

2021-09-29 01:39:44 INFO: Use device: gpu
2021-09-29 01:39:44 INFO: Loading: tokenize
2021-09-29 01:39:44 INFO: Loading: pos
2021-09-29 01:39:45 INFO: Done loading processors!


CPU times: user 23min 16s, sys: 2.07 s, total: 23min 18s
Wall time: 23min 17s


In [32]:
# Print every training sentence whose feature count differs from its label count.
verify_prepped_data(X_train, y_train)

In [33]:
%%time
# Rebuild the development features and labels with the same feature variant.
X_dev, y_dev = data_prep('../data/Dev.txt')

2021-09-29 02:02:44 INFO: Loading these models for language: bg (Bulgarian):
| Processor | Package  |
------------------------
| tokenize  | standard |

2021-09-29 02:02:44 INFO: Use device: gpu
2021-09-29 02:02:44 INFO: Loading: tokenize
2021-09-29 02:02:44 INFO: Done loading processors!
2021-09-29 02:02:56 INFO: Loading these models for language: bg (Bulgarian):
| Processor | Package  |
------------------------
| tokenize  | standard |
| pos       | standard |

2021-09-29 02:02:56 INFO: Use device: gpu
2021-09-29 02:02:56 INFO: Loading: tokenize
2021-09-29 02:02:56 INFO: Loading: pos
2021-09-29 02:02:58 INFO: Done loading processors!


CPU times: user 11min 47s, sys: 1.06 s, total: 11min 48s
Wall time: 11min 47s


In [34]:
# Print every development sentence whose feature count differs from its label count.
verify_prepped_data(X_dev, y_dev)

In [35]:
%%time
# Train a CRF on the rebuilt training data, with the same hyperparameters as the first run.
crf = sklearn_crfsuite.CRF(
    algorithm='lbfgs',
    c1=0.1,  # L1 regularization coefficient (per the sklearn-crfsuite docs)
    c2=0.1,  # L2 regularization coefficient (per the sklearn-crfsuite docs)
    max_iterations=100,
    all_possible_transitions=True
)
crf.fit(X_train, y_train)

CPU times: user 30min 24s, sys: 1.36 s, total: 30min 25s
Wall time: 30min 22s


In [None]:
# Evaluate on the dev set twice: first over every label the model knows,
# then again without the empty label.
labels = list(crf.classes_)
y_pred = crf.predict(X_dev)
print(metrics.flat_f1_score(y_dev, y_pred, average='weighted', labels=labels))
print(metrics.flat_classification_report(y_dev, y_pred, labels=labels, digits=3))
# Drop the empty label ('' = word not followed by punctuation);
# list.remove raises ValueError if '' is not among the labels.
labels.remove('')
print(metrics.flat_f1_score(y_dev, y_pred, average='weighted', labels=labels))
print(metrics.flat_classification_report(y_dev, y_pred, labels=labels, digits=3))
# Rebuild readable sentences from the predictions and show the first 100.
punctuated_sentences = punctuate(X_dev, y_pred)
print(punctuated_sentences[:100])

### Contains repetitive coordinative conjunction

In [24]:
# input - sentence (list of dictionaries, one per POS-tagged word) and the index i of a word in it
# output - a single dictionary with the features of the word at index i
#          (variant with the "coordinating conjunction before this word" features)
def word2features(sentence, i):
    # Every helper result is unpacked as a (flag, position) pair.
    has_interrogative_word, interrogative_word_at = contains_interrogative_word(sentence)
    has_interrogative_particle, interrogative_particle_at = contains_interrogative_particle(sentence)
    has_coord_conj_before, coord_conj_at = contains_repetitive_coord_conj_before(sentence, i)

    current = sentence[i]
    last_index = len(sentence) - 1

    features = {
        'word': current['text'],
        'sent_len': len(sentence),
        'pos_in_sent': i,
        'upos': current['upos'],
        'xpos': current['xpos'],
        'first_word_in_sent': sentence[0]['text'],
        'contains_interrogative_word': has_interrogative_word,
        'interrogative_word_position': interrogative_word_at,
        'contains_interrogative_particle': has_interrogative_particle,
        'interrogative_particle_position': interrogative_particle_at,
        'contains_repetitive_coord_conj_before': has_coord_conj_before,
        'repetitive_coord_conj_position': coord_conj_at
    }

    # Left context: previous word, or a beginning-of-sentence marker.
    if i > 0:
        previous = sentence[i - 1]
        features['prev_word'] = previous['text']
        features['prev_word_upos'] = previous['upos']
        features['prev_word_xpos'] = previous['xpos']
    else:
        features['BOS'] = True

    # Right context: next word, or an end-of-sentence marker.
    if i < last_index:
        following = sentence[i + 1]
        features['next_word'] = following['text']
        features['next_word_upos'] = following['upos']
        features['next_word_xpos'] = following['xpos']
    else:
        features['EOS'] = True

    if i < last_index - 1:
        after_following = sentence[i + 2]
        features['word_after_next_word'] = after_following['text']
        features['word_after_next_word_upos'] = after_following['upos']
        features['word_after_next_word_xpos'] = after_following['xpos']

    return features

In [25]:
%%time
# Rebuild the training features and labels with the word2features variant defined above.
X_train, y_train = data_prep('../data/Train.txt')

2021-09-29 09:46:52 INFO: Loading these models for language: bg (Bulgarian):
| Processor | Package  |
------------------------
| tokenize  | standard |

2021-09-29 09:46:52 INFO: Use device: gpu
2021-09-29 09:46:52 INFO: Loading: tokenize
2021-09-29 09:46:52 INFO: Done loading processors!
2021-09-29 09:47:14 INFO: Loading these models for language: bg (Bulgarian):
| Processor | Package  |
------------------------
| tokenize  | standard |
| pos       | standard |

2021-09-29 09:47:14 INFO: Use device: gpu
2021-09-29 09:47:14 INFO: Loading: tokenize
2021-09-29 09:47:14 INFO: Loading: pos
2021-09-29 09:47:16 INFO: Done loading processors!


CPU times: user 23min 4s, sys: 3.16 s, total: 23min 7s
Wall time: 23min 6s


In [26]:
# Print every training sentence whose feature count differs from its label count.
verify_prepped_data(X_train, y_train)

In [27]:
%%time
# Rebuild the development features and labels with the same feature variant.
X_dev, y_dev = data_prep('../data/Dev.txt')

2021-09-29 10:09:58 INFO: Loading these models for language: bg (Bulgarian):
| Processor | Package  |
------------------------
| tokenize  | standard |

2021-09-29 10:09:58 INFO: Use device: gpu
2021-09-29 10:09:58 INFO: Loading: tokenize
2021-09-29 10:09:58 INFO: Done loading processors!
2021-09-29 10:10:11 INFO: Loading these models for language: bg (Bulgarian):
| Processor | Package  |
------------------------
| tokenize  | standard |
| pos       | standard |

2021-09-29 10:10:11 INFO: Use device: gpu
2021-09-29 10:10:11 INFO: Loading: tokenize
2021-09-29 10:10:11 INFO: Loading: pos
2021-09-29 10:10:13 INFO: Done loading processors!


CPU times: user 11min 46s, sys: 1.33 s, total: 11min 48s
Wall time: 11min 47s


In [28]:
# Print every development sentence whose feature count differs from its label count.
verify_prepped_data(X_dev, y_dev)

In [29]:
%%time
# Train a CRF on the rebuilt training data, with the same hyperparameters as the first run.
crf = sklearn_crfsuite.CRF(
    algorithm='lbfgs',
    c1=0.1,  # L1 regularization coefficient (per the sklearn-crfsuite docs)
    c2=0.1,  # L2 regularization coefficient (per the sklearn-crfsuite docs)
    max_iterations=100,
    all_possible_transitions=True
)
crf.fit(X_train, y_train)

CPU times: user 28min 59s, sys: 1.67 s, total: 29min 1s
Wall time: 28min 57s


In [None]:
# Evaluate on the dev set twice: first over every label the model knows,
# then again without the empty label.
labels = list(crf.classes_)
y_pred = crf.predict(X_dev)
print(metrics.flat_f1_score(y_dev, y_pred, average='weighted', labels=labels))
print(metrics.flat_classification_report(y_dev, y_pred, labels=labels, digits=3))
# Drop the empty label ('' = word not followed by punctuation);
# list.remove raises ValueError if '' is not among the labels.
labels.remove('')
print(metrics.flat_f1_score(y_dev, y_pred, average='weighted', labels=labels))
print(metrics.flat_classification_report(y_dev, y_pred, labels=labels, digits=3))
# Rebuild readable sentences from the predictions and show the first 100.
punctuated_sentences = punctuate(X_dev, y_pred)
print(punctuated_sentences[:100])

### Add more context of words before

In [31]:
# input - sentence (list of dictionaries, one per POS-tagged word) and the index i of a word in it
# output - a single dictionary with the features of the word at index i
#          (variant with two words of context on the left as well as on the right)
def word2features(sentence, i):
    # Every helper result is unpacked as a (flag, position) pair.
    has_interrogative_word, interrogative_word_at = contains_interrogative_word(sentence)
    has_interrogative_particle, interrogative_particle_at = contains_interrogative_particle(sentence)

    current = sentence[i]
    last_index = len(sentence) - 1

    features = {
        'word': current['text'],
        'sent_len': len(sentence),
        'pos_in_sent': i,
        'upos': current['upos'],
        'xpos': current['xpos'],
        'first_word_in_sent': sentence[0]['text'],
        'contains_interrogative_word': has_interrogative_word,
        'interrogative_word_position': interrogative_word_at,
        'contains_interrogative_particle': has_interrogative_particle,
        'interrogative_particle_position': interrogative_particle_at
    }

    # Left context: previous word, or a beginning-of-sentence marker.
    if i > 0:
        previous = sentence[i - 1]
        features['prev_word'] = previous['text']
        features['prev_word_upos'] = previous['upos']
        features['prev_word_xpos'] = previous['xpos']
    else:
        features['BOS'] = True

    if i > 1:
        before_previous = sentence[i - 2]
        features['word_before_prev_word'] = before_previous['text']
        features['word_before_prev_word_upos'] = before_previous['upos']
        features['word_before_prev_word_xpos'] = before_previous['xpos']

    # Right context: next word, or an end-of-sentence marker.
    if i < last_index:
        following = sentence[i + 1]
        features['next_word'] = following['text']
        features['next_word_upos'] = following['upos']
        features['next_word_xpos'] = following['xpos']
    else:
        features['EOS'] = True

    if i < last_index - 1:
        after_following = sentence[i + 2]
        features['word_after_next_word'] = after_following['text']
        features['word_after_next_word_upos'] = after_following['upos']
        features['word_after_next_word_xpos'] = after_following['xpos']

    return features

In [32]:
%%time
X_train, y_train = data_prep('../data/Train.txt')

2021-09-29 10:50:57 INFO: Loading these models for language: bg (Bulgarian):
| Processor | Package  |
------------------------
| tokenize  | standard |

2021-09-29 10:50:57 INFO: Use device: gpu
2021-09-29 10:50:57 INFO: Loading: tokenize
2021-09-29 10:50:57 INFO: Done loading processors!
2021-09-29 10:51:14 INFO: Loading these models for language: bg (Bulgarian):
| Processor | Package  |
------------------------
| tokenize  | standard |
| pos       | standard |

2021-09-29 10:51:14 INFO: Use device: gpu
2021-09-29 10:51:14 INFO: Loading: tokenize
2021-09-29 10:51:14 INFO: Loading: pos
2021-09-29 10:51:15 INFO: Done loading processors!


CPU times: user 23min 14s, sys: 2.34 s, total: 23min 16s
Wall time: 23min 15s


In [33]:
verify_prepped_data(X_train, y_train)

In [34]:
%%time
X_dev, y_dev = data_prep('../data/Dev.txt')

2021-09-29 11:14:12 INFO: Loading these models for language: bg (Bulgarian):
| Processor | Package  |
------------------------
| tokenize  | standard |

2021-09-29 11:14:12 INFO: Use device: gpu
2021-09-29 11:14:12 INFO: Loading: tokenize
2021-09-29 11:14:12 INFO: Done loading processors!
2021-09-29 11:14:25 INFO: Loading these models for language: bg (Bulgarian):
| Processor | Package  |
------------------------
| tokenize  | standard |
| pos       | standard |

2021-09-29 11:14:25 INFO: Use device: gpu
2021-09-29 11:14:25 INFO: Loading: tokenize
2021-09-29 11:14:25 INFO: Loading: pos
2021-09-29 11:14:27 INFO: Done loading processors!


CPU times: user 11min 47s, sys: 1.13 s, total: 11min 48s
Wall time: 11min 47s


In [35]:
verify_prepped_data(X_dev, y_dev)

In [36]:
%%time
# Build a new CRF model and fit it on the prepared training features/labels.
# NOTE(review): per the sklearn-crfsuite docs c1 and c2 are the L1 and L2
# regularisation coefficients - the same values are used in every experiment.
crf = sklearn_crfsuite.CRF(
    algorithm='lbfgs',
    c1=0.1,
    c2=0.1,
    max_iterations=100,
    all_possible_transitions=True
)
crf.fit(X_train, y_train)

CPU times: user 33min 15s, sys: 2.07 s, total: 33min 18s
Wall time: 33min 13s


In [None]:
# Evaluate the fitted CRF on the dev set.
labels = list(crf.classes_)  # all labels known to the model, including '' (word not followed by punctuation)
y_pred = crf.predict(X_dev)
# First pair of scores: weighted F1 and per-label report over every label, '' included.
print(metrics.flat_f1_score(y_dev, y_pred, average='weighted', labels=labels))
print(metrics.flat_classification_report(y_dev, y_pred, labels=labels, digits=3))
# Drop '' so the second pair of scores covers the punctuation labels only.
labels.remove('')
print(metrics.flat_f1_score(y_dev, y_pred, average='weighted', labels=labels))
print(metrics.flat_classification_report(y_dev, y_pred, labels=labels, digits=3))
# presumably re-inserts the predicted punctuation into the dev sentences - verify against punctuate()
punctuated_sentences = punctuate(X_dev, y_pred)
print(punctuated_sentences[:100])  # show only the first 100 elements of the result

In [39]:
# input - sentence (list of token dictionaries with 'text', 'upos' and 'xpos' keys) and the index of a token
# output - dictionary of features for that single token (context: three tokens before and two tokens after)
def word2features(sentence, i):
    # Sentence-level cues; each helper result is unpacked into a (contains, position) pair.
    has_q_word, q_word_pos = contains_interrogative_word(sentence)
    has_q_particle, q_particle_pos = contains_interrogative_particle(sentence)

    def describe(prefix, token):
        # Surface form and both POS tags of one context token, keyed by `prefix`.
        return {
            prefix: token['text'],
            prefix + '_upos': token['upos'],
            prefix + '_xpos': token['xpos'],
        }

    current = sentence[i]
    size = len(sentence)

    features = {
        'word': current['text'],
        'sent_len': size,
        'pos_in_sent': i,
        'upos': current['upos'],
        'xpos': current['xpos'],
        'first_word_in_sent': sentence[0]['text'],
        'contains_interrogative_word': has_q_word,
        'interrogative_word_position': q_word_pos,
        'contains_interrogative_particle': has_q_particle,
        'interrogative_particle_position': q_particle_pos,
    }

    # Left context: mark the beginning of the sentence when there is no previous token.
    if i <= 0:
        features['BOS'] = True
    else:
        features.update(describe('prev_word', sentence[i - 1]))
    if i > 1:
        features.update(describe('word_before_prev_word', sentence[i - 2]))
    if i > 2:
        features.update(describe('2_words_before_prev_word', sentence[i - 3]))

    # Right context: mark the end of the sentence when there is no next token.
    if i >= size - 1:
        features['EOS'] = True
    else:
        features.update(describe('next_word', sentence[i + 1]))
    if i < size - 2:
        features.update(describe('word_after_next_word', sentence[i + 2]))

    return features

In [40]:
%%time
X_train, y_train = data_prep('../data/Train.txt')

2021-09-29 12:16:15 INFO: Loading these models for language: bg (Bulgarian):
| Processor | Package  |
------------------------
| tokenize  | standard |

2021-09-29 12:16:15 INFO: Use device: gpu
2021-09-29 12:16:15 INFO: Loading: tokenize
2021-09-29 12:16:15 INFO: Done loading processors!
2021-09-29 12:16:32 INFO: Loading these models for language: bg (Bulgarian):
| Processor | Package  |
------------------------
| tokenize  | standard |
| pos       | standard |

2021-09-29 12:16:32 INFO: Use device: gpu
2021-09-29 12:16:32 INFO: Loading: tokenize
2021-09-29 12:16:32 INFO: Loading: pos
2021-09-29 12:16:33 INFO: Done loading processors!


CPU times: user 23min 6s, sys: 2.15 s, total: 23min 9s
Wall time: 23min 8s


In [41]:
verify_prepped_data(X_train, y_train)

In [42]:
%%time
X_dev, y_dev = data_prep('../data/Dev.txt')

2021-09-29 12:39:24 INFO: Loading these models for language: bg (Bulgarian):
| Processor | Package  |
------------------------
| tokenize  | standard |

2021-09-29 12:39:24 INFO: Use device: gpu
2021-09-29 12:39:24 INFO: Loading: tokenize
2021-09-29 12:39:24 INFO: Done loading processors!
2021-09-29 12:39:36 INFO: Loading these models for language: bg (Bulgarian):
| Processor | Package  |
------------------------
| tokenize  | standard |
| pos       | standard |

2021-09-29 12:39:36 INFO: Use device: gpu
2021-09-29 12:39:36 INFO: Loading: tokenize
2021-09-29 12:39:36 INFO: Loading: pos
2021-09-29 12:39:38 INFO: Done loading processors!


CPU times: user 11min 58s, sys: 952 ms, total: 11min 59s
Wall time: 11min 58s


In [43]:
verify_prepped_data(X_dev, y_dev)

In [44]:
%%time
# Build a new CRF model and fit it on the prepared training features/labels.
# NOTE(review): per the sklearn-crfsuite docs c1 and c2 are the L1 and L2
# regularisation coefficients - the same values are used in every experiment.
crf = sklearn_crfsuite.CRF(
    algorithm='lbfgs',
    c1=0.1,
    c2=0.1,
    max_iterations=100,
    all_possible_transitions=True
)
crf.fit(X_train, y_train)

CPU times: user 32min 9s, sys: 3.06 s, total: 32min 12s
Wall time: 32min 8s


In [None]:
# Evaluate the fitted CRF on the dev set.
labels = list(crf.classes_)  # all labels known to the model, including '' (word not followed by punctuation)
y_pred = crf.predict(X_dev)
# First pair of scores: weighted F1 and per-label report over every label, '' included.
print(metrics.flat_f1_score(y_dev, y_pred, average='weighted', labels=labels))
print(metrics.flat_classification_report(y_dev, y_pred, labels=labels, digits=3))
# Drop '' so the second pair of scores covers the punctuation labels only.
labels.remove('')
print(metrics.flat_f1_score(y_dev, y_pred, average='weighted', labels=labels))
print(metrics.flat_classification_report(y_dev, y_pred, labels=labels, digits=3))
# presumably re-inserts the predicted punctuation into the dev sentences - verify against punctuate()
punctuated_sentences = punctuate(X_dev, y_pred)
print(punctuated_sentences[:100])  # show only the first 100 elements of the result

### Add more context of words after

In [46]:
# input - sentence (list of token dictionaries with 'text', 'upos' and 'xpos' keys) and the index of a token
# output - dictionary of features for that single token (context: one token before and three tokens after)
def word2features(sentence, i):
    # Sentence-level cues; each helper result is unpacked into a (contains, position) pair.
    has_q_word, q_word_pos = contains_interrogative_word(sentence)
    has_q_particle, q_particle_pos = contains_interrogative_particle(sentence)

    def describe(prefix, token):
        # Surface form and both POS tags of one context token, keyed by `prefix`.
        return {
            prefix: token['text'],
            prefix + '_upos': token['upos'],
            prefix + '_xpos': token['xpos'],
        }

    current = sentence[i]
    size = len(sentence)

    features = {
        'word': current['text'],
        'sent_len': size,
        'pos_in_sent': i,
        'upos': current['upos'],
        'xpos': current['xpos'],
        'first_word_in_sent': sentence[0]['text'],
        'contains_interrogative_word': has_q_word,
        'interrogative_word_position': q_word_pos,
        'contains_interrogative_particle': has_q_particle,
        'interrogative_particle_position': q_particle_pos,
    }

    # Left context: mark the beginning of the sentence when there is no previous token.
    if i <= 0:
        features['BOS'] = True
    else:
        features.update(describe('prev_word', sentence[i - 1]))

    # Right context: mark the end of the sentence when there is no next token.
    if i >= size - 1:
        features['EOS'] = True
    else:
        features.update(describe('next_word', sentence[i + 1]))
    if i < size - 2:
        features.update(describe('word_after_next_word', sentence[i + 2]))
    if i < size - 3:
        features.update(describe('2_words_after_next_word', sentence[i + 3]))

    return features

In [47]:
%%time
X_train, y_train = data_prep('../data/Train.txt')

2021-09-29 13:23:51 INFO: Loading these models for language: bg (Bulgarian):
| Processor | Package  |
------------------------
| tokenize  | standard |

2021-09-29 13:23:51 INFO: Use device: gpu
2021-09-29 13:23:51 INFO: Loading: tokenize
2021-09-29 13:23:51 INFO: Done loading processors!
2021-09-29 13:24:17 INFO: Loading these models for language: bg (Bulgarian):
| Processor | Package  |
------------------------
| tokenize  | standard |
| pos       | standard |

2021-09-29 13:24:17 INFO: Use device: gpu
2021-09-29 13:24:17 INFO: Loading: tokenize
2021-09-29 13:24:17 INFO: Loading: pos
2021-09-29 13:24:18 INFO: Done loading processors!


CPU times: user 24min 15s, sys: 2.66 s, total: 24min 17s
Wall time: 24min 16s


In [48]:
verify_prepped_data(X_train, y_train)

In [49]:
%%time
X_dev, y_dev = data_prep('../data/Dev.txt')

2021-09-29 13:48:07 INFO: Loading these models for language: bg (Bulgarian):
| Processor | Package  |
------------------------
| tokenize  | standard |

2021-09-29 13:48:07 INFO: Use device: gpu
2021-09-29 13:48:07 INFO: Loading: tokenize
2021-09-29 13:48:07 INFO: Done loading processors!
2021-09-29 13:48:20 INFO: Loading these models for language: bg (Bulgarian):
| Processor | Package  |
------------------------
| tokenize  | standard |
| pos       | standard |

2021-09-29 13:48:20 INFO: Use device: gpu
2021-09-29 13:48:20 INFO: Loading: tokenize
2021-09-29 13:48:20 INFO: Loading: pos
2021-09-29 13:48:22 INFO: Done loading processors!


CPU times: user 12min, sys: 896 ms, total: 12min 1s
Wall time: 12min


In [50]:
verify_prepped_data(X_dev, y_dev)

In [51]:
%%time
# Build a new CRF model and fit it on the prepared training features/labels.
# NOTE(review): per the sklearn-crfsuite docs c1 and c2 are the L1 and L2
# regularisation coefficients - the same values are used in every experiment.
crf = sklearn_crfsuite.CRF(
    algorithm='lbfgs',
    c1=0.1,
    c2=0.1,
    max_iterations=100,
    all_possible_transitions=True
)
crf.fit(X_train, y_train)

CPU times: user 29min 5s, sys: 1.5 s, total: 29min 7s
Wall time: 29min 3s


In [None]:
# Evaluate the fitted CRF on the dev set.
labels = list(crf.classes_)  # all labels known to the model, including '' (word not followed by punctuation)
y_pred = crf.predict(X_dev)
# First pair of scores: weighted F1 and per-label report over every label, '' included.
print(metrics.flat_f1_score(y_dev, y_pred, average='weighted', labels=labels))
print(metrics.flat_classification_report(y_dev, y_pred, labels=labels, digits=3))
# Drop '' so the second pair of scores covers the punctuation labels only.
labels.remove('')
print(metrics.flat_f1_score(y_dev, y_pred, average='weighted', labels=labels))
print(metrics.flat_classification_report(y_dev, y_pred, labels=labels, digits=3))
# presumably re-inserts the predicted punctuation into the dev sentences - verify against punctuate()
punctuated_sentences = punctuate(X_dev, y_pred)
print(punctuated_sentences[:100])  # show only the first 100 elements of the result

### Check if word starts with a capital letter

In [53]:
# input - sentence (list of token dictionaries with 'text', 'upos' and 'xpos' keys) and the index of a token
# output - dictionary of features for that single token (context: one token before and two tokens after,
#          each with a flag telling whether the token starts with a capital letter)
def word2features(sentence, i):
    # Sentence-level cues; each helper result is unpacked into a (contains, position) pair.
    sentence_contains_interrogative_word, interrogative_word_position = contains_interrogative_word(sentence)
    sentence_contains_interrogative_particle, interrogative_particle_position = contains_interrogative_particle(sentence)

    def starts_with_capital(token):
        # Slice instead of indexing: text[0] raises IndexError on an empty token,
        # while ''[:1].isupper() is simply False.
        return token['text'][:1].isupper()

    def describe(prefix, token):
        # Surface form, both POS tags and the capital-letter flag of one context token.
        return {
            prefix: token['text'],
            prefix + '_upos': token['upos'],
            prefix + '_xpos': token['xpos'],
            prefix + '_starts_with_capital_letter': starts_with_capital(token),
        }

    current = sentence[i]
    size = len(sentence)

    features = {
        'word': current['text'],
        'sent_len': size,
        'pos_in_sent': i,
        'upos': current['upos'],
        'xpos': current['xpos'],
        'first_word_in_sent': sentence[0]['text'],
        'contains_interrogative_word': sentence_contains_interrogative_word,
        'interrogative_word_position': interrogative_word_position,
        'contains_interrogative_particle': sentence_contains_interrogative_particle,
        'interrogative_particle_position': interrogative_particle_position,
        'starts_with_capital_letter': starts_with_capital(current),
    }

    # Left context: mark the beginning of the sentence when there is no previous token.
    if i > 0:
        features.update(describe('prev_word', sentence[i - 1]))
    else:
        features['BOS'] = True

    # Right context: mark the end of the sentence when there is no next token.
    if i < size - 1:
        features.update(describe('next_word', sentence[i + 1]))
    else:
        features['EOS'] = True

    if i < size - 2:
        features.update(describe('word_after_next_word', sentence[i + 2]))

    return features

In [54]:
%%time
X_train, y_train = data_prep('../data/Train.txt')

2021-09-29 14:54:36 INFO: Loading these models for language: bg (Bulgarian):
| Processor | Package  |
------------------------
| tokenize  | standard |

2021-09-29 14:54:36 INFO: Use device: gpu
2021-09-29 14:54:36 INFO: Loading: tokenize
2021-09-29 14:54:36 INFO: Done loading processors!
2021-09-29 14:54:52 INFO: Loading these models for language: bg (Bulgarian):
| Processor | Package  |
------------------------
| tokenize  | standard |
| pos       | standard |

2021-09-29 14:54:52 INFO: Use device: gpu
2021-09-29 14:54:52 INFO: Loading: tokenize
2021-09-29 14:54:52 INFO: Loading: pos
2021-09-29 14:54:53 INFO: Done loading processors!


CPU times: user 23min 40s, sys: 2.24 s, total: 23min 42s
Wall time: 23min 41s


In [55]:
verify_prepped_data(X_train, y_train)

In [56]:
%%time
X_dev, y_dev = data_prep('../data/Dev.txt')

2021-09-29 15:18:18 INFO: Loading these models for language: bg (Bulgarian):
| Processor | Package  |
------------------------
| tokenize  | standard |

2021-09-29 15:18:18 INFO: Use device: gpu
2021-09-29 15:18:18 INFO: Loading: tokenize
2021-09-29 15:18:18 INFO: Done loading processors!
2021-09-29 15:18:30 INFO: Loading these models for language: bg (Bulgarian):
| Processor | Package  |
------------------------
| tokenize  | standard |
| pos       | standard |

2021-09-29 15:18:30 INFO: Use device: gpu
2021-09-29 15:18:30 INFO: Loading: tokenize
2021-09-29 15:18:30 INFO: Loading: pos
2021-09-29 15:18:32 INFO: Done loading processors!


CPU times: user 11min 55s, sys: 996 ms, total: 11min 56s
Wall time: 11min 55s


In [57]:
verify_prepped_data(X_dev, y_dev)

In [58]:
%%time
# Build a new CRF model and fit it on the prepared training features/labels.
# NOTE(review): per the sklearn-crfsuite docs c1 and c2 are the L1 and L2
# regularisation coefficients - the same values are used in every experiment.
crf = sklearn_crfsuite.CRF(
    algorithm='lbfgs',
    c1=0.1,
    c2=0.1,
    max_iterations=100,
    all_possible_transitions=True
)
crf.fit(X_train, y_train)

CPU times: user 30min 15s, sys: 1.28 s, total: 30min 16s
Wall time: 30min 13s


In [None]:
# Evaluate the fitted CRF on the dev set.
labels = list(crf.classes_)  # all labels known to the model, including '' (word not followed by punctuation)
y_pred = crf.predict(X_dev)
# First pair of scores: weighted F1 and per-label report over every label, '' included.
print(metrics.flat_f1_score(y_dev, y_pred, average='weighted', labels=labels))
print(metrics.flat_classification_report(y_dev, y_pred, labels=labels, digits=3))
# Drop '' so the second pair of scores covers the punctuation labels only.
labels.remove('')
print(metrics.flat_f1_score(y_dev, y_pred, average='weighted', labels=labels))
print(metrics.flat_classification_report(y_dev, y_pred, labels=labels, digits=3))
# presumably re-inserts the predicted punctuation into the dev sentences - verify against punctuate()
punctuated_sentences = punctuate(X_dev, y_pred)
print(punctuated_sentences[:100])  # show only the first 100 elements of the result

### Contains relative pronoun before

In [77]:
# input - sentence (list of token dictionaries with 'text', 'upos' and 'xpos' keys) and the index of a token
# output - dictionary of features for that single token (context: one token before and two tokens after,
#          plus the result of the relative-pronoun lookup for this index)
def word2features(sentence, i):
    # Each helper result is unpacked into a (contains, position) pair; the
    # relative-pronoun helper also receives the index of the current token.
    has_q_word, q_word_pos = contains_interrogative_word(sentence)
    has_q_particle, q_particle_pos = contains_interrogative_particle(sentence)
    has_rel_pronoun, rel_pronoun_pos = contains_relative_pronoun_before(sentence, i)

    def describe(prefix, token):
        # Surface form and both POS tags of one context token, keyed by `prefix`.
        return {
            prefix: token['text'],
            prefix + '_upos': token['upos'],
            prefix + '_xpos': token['xpos'],
        }

    current = sentence[i]
    size = len(sentence)

    features = {
        'word': current['text'],
        'sent_len': size,
        'pos_in_sent': i,
        'upos': current['upos'],
        'xpos': current['xpos'],
        'first_word_in_sent': sentence[0]['text'],
        'contains_interrogative_word': has_q_word,
        'interrogative_word_position': q_word_pos,
        'contains_interrogative_particle': has_q_particle,
        'interrogative_particle_position': q_particle_pos,
        'contains_relative_pronoun_before': has_rel_pronoun,
        'relative_pronoun_position': rel_pronoun_pos,
    }

    # Left context: mark the beginning of the sentence when there is no previous token.
    if i <= 0:
        features['BOS'] = True
    else:
        features.update(describe('prev_word', sentence[i - 1]))

    # Right context: mark the end of the sentence when there is no next token.
    if i >= size - 1:
        features['EOS'] = True
    else:
        features.update(describe('next_word', sentence[i + 1]))
    if i < size - 2:
        features.update(describe('word_after_next_word', sentence[i + 2]))

    return features

In [78]:
%%time
X_train, y_train = data_prep('../data/Train.txt')

2021-09-29 20:23:44 INFO: Loading these models for language: bg (Bulgarian):
| Processor | Package  |
------------------------
| tokenize  | standard |

2021-09-29 20:23:44 INFO: Use device: gpu
2021-09-29 20:23:44 INFO: Loading: tokenize
2021-09-29 20:23:44 INFO: Done loading processors!
2021-09-29 20:24:01 INFO: Loading these models for language: bg (Bulgarian):
| Processor | Package  |
------------------------
| tokenize  | standard |
| pos       | standard |

2021-09-29 20:24:01 INFO: Use device: gpu
2021-09-29 20:24:01 INFO: Loading: tokenize
2021-09-29 20:24:01 INFO: Loading: pos
2021-09-29 20:24:01 INFO: Done loading processors!


CPU times: user 24min 45s, sys: 3.38 s, total: 24min 48s
Wall time: 24min 47s


In [79]:
verify_prepped_data(X_train, y_train)

In [80]:
%%time
X_dev, y_dev = data_prep('../data/Dev.txt')

2021-09-29 20:48:32 INFO: Loading these models for language: bg (Bulgarian):
| Processor | Package  |
------------------------
| tokenize  | standard |

2021-09-29 20:48:32 INFO: Use device: gpu
2021-09-29 20:48:32 INFO: Loading: tokenize
2021-09-29 20:48:32 INFO: Done loading processors!
2021-09-29 20:48:45 INFO: Loading these models for language: bg (Bulgarian):
| Processor | Package  |
------------------------
| tokenize  | standard |
| pos       | standard |

2021-09-29 20:48:45 INFO: Use device: gpu
2021-09-29 20:48:45 INFO: Loading: tokenize
2021-09-29 20:48:45 INFO: Loading: pos
2021-09-29 20:48:47 INFO: Done loading processors!


CPU times: user 11min 57s, sys: 1.15 s, total: 11min 58s
Wall time: 11min 57s


In [81]:
verify_prepped_data(X_dev, y_dev)

In [82]:
%%time
# Build a new CRF model and fit it on the prepared training features/labels.
# NOTE(review): per the sklearn-crfsuite docs c1 and c2 are the L1 and L2
# regularisation coefficients - the same values are used in every experiment.
crf = sklearn_crfsuite.CRF(
    algorithm='lbfgs',
    c1=0.1,
    c2=0.1,
    max_iterations=100,
    all_possible_transitions=True
)
crf.fit(X_train, y_train)

CPU times: user 38min 32s, sys: 2.04 s, total: 38min 34s
Wall time: 38min 28s


In [None]:
# Evaluate the fitted CRF on the dev set.
labels = list(crf.classes_)  # all labels known to the model, including '' (word not followed by punctuation)
y_pred = crf.predict(X_dev)
# First pair of scores: weighted F1 and per-label report over every label, '' included.
print(metrics.flat_f1_score(y_dev, y_pred, average='weighted', labels=labels))
print(metrics.flat_classification_report(y_dev, y_pred, labels=labels, digits=3))
# Drop '' so the second pair of scores covers the punctuation labels only.
labels.remove('')
print(metrics.flat_f1_score(y_dev, y_pred, average='weighted', labels=labels))
print(metrics.flat_classification_report(y_dev, y_pred, labels=labels, digits=3))
# presumably re-inserts the predicted punctuation into the dev sentences - verify against punctuate()
punctuated_sentences = punctuate(X_dev, y_pred)
print(punctuated_sentences[:100])  # show only the first 100 elements of the result

### Contains conjunction before

In [84]:
# input - sentence (list of token dictionaries with 'text', 'upos' and 'xpos' keys) and the index of a token
# output - dictionary of features for that single token (context: one token before and two tokens after,
#          plus the result of the conjunction lookup for this index)
def word2features(sentence, i):
    # Each helper result is unpacked into a (contains, position) pair; the
    # conjunction helper also receives the index of the current token.
    has_q_word, q_word_pos = contains_interrogative_word(sentence)
    has_q_particle, q_particle_pos = contains_interrogative_particle(sentence)
    has_conj, conj_pos = contains_conj_before(sentence, i)

    def describe(prefix, token):
        # Surface form and both POS tags of one context token, keyed by `prefix`.
        return {
            prefix: token['text'],
            prefix + '_upos': token['upos'],
            prefix + '_xpos': token['xpos'],
        }

    current = sentence[i]
    size = len(sentence)

    features = {
        'word': current['text'],
        'sent_len': size,
        'pos_in_sent': i,
        'upos': current['upos'],
        'xpos': current['xpos'],
        'first_word_in_sent': sentence[0]['text'],
        'contains_interrogative_word': has_q_word,
        'interrogative_word_position': q_word_pos,
        'contains_interrogative_particle': has_q_particle,
        'interrogative_particle_position': q_particle_pos,
        'contains_conj_before': has_conj,
        'conj_position': conj_pos,
    }

    # Left context: mark the beginning of the sentence when there is no previous token.
    if i <= 0:
        features['BOS'] = True
    else:
        features.update(describe('prev_word', sentence[i - 1]))

    # Right context: mark the end of the sentence when there is no next token.
    if i >= size - 1:
        features['EOS'] = True
    else:
        features.update(describe('next_word', sentence[i + 1]))
    if i < size - 2:
        features.update(describe('word_after_next_word', sentence[i + 2]))

    return features

In [85]:
%%time
X_train, y_train = data_prep('../data/Train.txt')

2021-09-29 21:39:18 INFO: Loading these models for language: bg (Bulgarian):
| Processor | Package  |
------------------------
| tokenize  | standard |

2021-09-29 21:39:18 INFO: Use device: gpu
2021-09-29 21:39:18 INFO: Loading: tokenize
2021-09-29 21:39:18 INFO: Done loading processors!
2021-09-29 21:39:44 INFO: Loading these models for language: bg (Bulgarian):
| Processor | Package  |
------------------------
| tokenize  | standard |
| pos       | standard |

2021-09-29 21:39:44 INFO: Use device: gpu
2021-09-29 21:39:44 INFO: Loading: tokenize
2021-09-29 21:39:44 INFO: Loading: pos
2021-09-29 21:39:46 INFO: Done loading processors!


CPU times: user 24min 54s, sys: 2.29 s, total: 24min 56s
Wall time: 24min 56s


In [86]:
verify_prepped_data(X_train, y_train)

In [87]:
%%time
X_dev, y_dev = data_prep('../data/Dev.txt')

2021-09-29 22:04:14 INFO: Loading these models for language: bg (Bulgarian):
| Processor | Package  |
------------------------
| tokenize  | standard |

2021-09-29 22:04:14 INFO: Use device: gpu
2021-09-29 22:04:14 INFO: Loading: tokenize
2021-09-29 22:04:14 INFO: Done loading processors!
2021-09-29 22:04:26 INFO: Loading these models for language: bg (Bulgarian):
| Processor | Package  |
------------------------
| tokenize  | standard |
| pos       | standard |

2021-09-29 22:04:26 INFO: Use device: gpu
2021-09-29 22:04:26 INFO: Loading: tokenize
2021-09-29 22:04:26 INFO: Loading: pos
2021-09-29 22:04:28 INFO: Done loading processors!


CPU times: user 12min 34s, sys: 1.11 s, total: 12min 35s
Wall time: 12min 34s


In [88]:
verify_prepped_data(X_dev, y_dev)

In [89]:
%%time
# Build a new CRF model and fit it on the prepared training features/labels.
# NOTE(review): per the sklearn-crfsuite docs c1 and c2 are the L1 and L2
# regularisation coefficients - the same values are used in every experiment.
crf = sklearn_crfsuite.CRF(
    algorithm='lbfgs',
    c1=0.1,
    c2=0.1,
    max_iterations=100,
    all_possible_transitions=True
)
crf.fit(X_train, y_train)

CPU times: user 41min 52s, sys: 3.25 s, total: 41min 55s
Wall time: 41min 49s


In [None]:
# Evaluate the fitted CRF on the dev set.
labels = list(crf.classes_)  # all labels known to the model, including '' (word not followed by punctuation)
y_pred = crf.predict(X_dev)
# First pair of scores: weighted F1 and per-label report over every label, '' included.
print(metrics.flat_f1_score(y_dev, y_pred, average='weighted', labels=labels))
print(metrics.flat_classification_report(y_dev, y_pred, labels=labels, digits=3))
# Drop '' so the second pair of scores covers the punctuation labels only.
labels.remove('')
print(metrics.flat_f1_score(y_dev, y_pred, average='weighted', labels=labels))
print(metrics.flat_classification_report(y_dev, y_pred, labels=labels, digits=3))
# presumably re-inserts the predicted punctuation into the dev sentences - verify against punctuate()
punctuated_sentences = punctuate(X_dev, y_pred)
print(punctuated_sentences[:100])  # show only the first 100 elements of the result

### Model combining the best features

In [91]:
# input - sentence (list of token dictionaries with 'text', 'upos' and 'xpos' keys) and the index of a token
# output - dictionary of features for that single token (context: two tokens before and two tokens after,
#          each with a flag telling whether the token starts with a capital letter)
def word2features(sentence, i):
    # Sentence-level cues; each helper result is unpacked into a (contains, position) pair.
    sentence_contains_interrogative_word, interrogative_word_position = contains_interrogative_word(sentence)
    sentence_contains_interrogative_particle, interrogative_particle_position = contains_interrogative_particle(sentence)
    sentence_contains_imperative_verb, imperative_verb_position = contains_imperative_verb(sentence)

    def starts_with_capital(token):
        # Slice instead of indexing: text[0] raises IndexError on an empty token,
        # while ''[:1].isupper() is simply False.
        return token['text'][:1].isupper()

    def describe(prefix, token):
        # Surface form, both POS tags and the capital-letter flag of one context token.
        return {
            prefix: token['text'],
            prefix + '_upos': token['upos'],
            prefix + '_xpos': token['xpos'],
            prefix + '_starts_with_capital_letter': starts_with_capital(token),
        }

    current = sentence[i]
    size = len(sentence)

    features = {
        'word': current['text'],
        'sent_len': size,
        'pos_in_sent': i,
        'upos': current['upos'],
        'xpos': current['xpos'],
        'first_word_in_sent': sentence[0]['text'],
        'contains_interrogative_word': sentence_contains_interrogative_word,
        'interrogative_word_position': interrogative_word_position,
        'contains_interrogative_particle': sentence_contains_interrogative_particle,
        'interrogative_particle_position': interrogative_particle_position,
        'contains_imperative_verb': sentence_contains_imperative_verb,
        'imperative_verb_position': imperative_verb_position,
        'starts_with_capital_letter': starts_with_capital(current),
    }

    # Left context: mark the beginning of the sentence when there is no previous token.
    if i > 0:
        features.update(describe('prev_word', sentence[i - 1]))
    else:
        features['BOS'] = True

    if i > 1:
        features.update(describe('word_before_prev_word', sentence[i - 2]))

    # Right context: mark the end of the sentence when there is no next token.
    if i < size - 1:
        features.update(describe('next_word', sentence[i + 1]))
    else:
        features['EOS'] = True

    if i < size - 2:
        features.update(describe('word_after_next_word', sentence[i + 2]))

    return features

In [92]:
%%time
X_train, y_train = data_prep('../data/Train.txt')

2021-09-29 23:07:29 INFO: Loading these models for language: bg (Bulgarian):
| Processor | Package  |
------------------------
| tokenize  | standard |

2021-09-29 23:07:29 INFO: Use device: gpu
2021-09-29 23:07:29 INFO: Loading: tokenize
2021-09-29 23:07:29 INFO: Done loading processors!
2021-09-29 23:07:52 INFO: Loading these models for language: bg (Bulgarian):
| Processor | Package  |
------------------------
| tokenize  | standard |
| pos       | standard |

2021-09-29 23:07:52 INFO: Use device: gpu
2021-09-29 23:07:52 INFO: Loading: tokenize
2021-09-29 23:07:52 INFO: Loading: pos
2021-09-29 23:07:54 INFO: Done loading processors!


CPU times: user 24min 40s, sys: 2.26 s, total: 24min 43s
Wall time: 24min 42s


In [93]:
verify_prepped_data(X_train, y_train)

In [94]:
%%time
X_dev, y_dev = data_prep('../data/Dev.txt')

2021-09-29 23:32:11 INFO: Loading these models for language: bg (Bulgarian):
| Processor | Package  |
------------------------
| tokenize  | standard |

2021-09-29 23:32:11 INFO: Use device: gpu
2021-09-29 23:32:11 INFO: Loading: tokenize
2021-09-29 23:32:11 INFO: Done loading processors!
2021-09-29 23:32:24 INFO: Loading these models for language: bg (Bulgarian):
| Processor | Package  |
------------------------
| tokenize  | standard |
| pos       | standard |

2021-09-29 23:32:24 INFO: Use device: gpu
2021-09-29 23:32:24 INFO: Loading: tokenize
2021-09-29 23:32:24 INFO: Loading: pos
2021-09-29 23:32:26 INFO: Done loading processors!


CPU times: user 12min 27s, sys: 1.1 s, total: 12min 28s
Wall time: 12min 27s


In [95]:
%%time
# Build a new CRF model and fit it on the prepared training features/labels.
# NOTE(review): per the sklearn-crfsuite docs c1 and c2 are the L1 and L2
# regularisation coefficients - the same values are used in every experiment.
crf = sklearn_crfsuite.CRF(
    algorithm='lbfgs',
    c1=0.1,
    c2=0.1,
    max_iterations=100,
    all_possible_transitions=True
)
crf.fit(X_train, y_train)

CPU times: user 45min 25s, sys: 2.87 s, total: 45min 28s
Wall time: 45min 22s


In [None]:
# Evaluate the fitted CRF on the dev set.
labels = list(crf.classes_)  # all labels known to the model, including '' (word not followed by punctuation)
y_pred = crf.predict(X_dev)
# First pair of scores: weighted F1 and per-label report over every label, '' included.
print(metrics.flat_f1_score(y_dev, y_pred, average='weighted', labels=labels))
print(metrics.flat_classification_report(y_dev, y_pred, labels=labels, digits=3))
# Drop '' so the second pair of scores covers the punctuation labels only.
labels.remove('')
print(metrics.flat_f1_score(y_dev, y_pred, average='weighted', labels=labels))
print(metrics.flat_classification_report(y_dev, y_pred, labels=labels, digits=3))
# presumably re-inserts the predicted punctuation into the dev sentences - verify against punctuate()
punctuated_sentences = punctuate(X_dev, y_pred)
print(punctuated_sentences[:100])  # show only the first 100 elements of the result

In [97]:
save_as_json(X_dev, '../data/X_dev.json')

In [99]:
save_as_json(y_dev, '../data/y_dev.json')

### Expand train set

In [21]:
# input - sentence (list of dictionaries, one per token) and the index i of the token to describe
# output - a single dictionary with the features of the token at index i
def word2features(sentence, i):
    # sentence-level cues - the helpers receive the whole sentence, so their values do not depend on i
    has_question_word, question_word_pos = contains_interrogative_word(sentence)
    has_question_particle, question_particle_pos = contains_interrogative_particle(sentence)
    has_imperative_verb, imperative_verb_pos = contains_imperative_verb(sentence)

    # text, POS tags and capitalisation of one context token, stored under keys that start with prefix
    def context_features(prefix, token):
        return {
            prefix: token['text'],
            prefix + '_upos': token['upos'],
            prefix + '_xpos': token['xpos'],
            prefix + '_starts_with_capital_letter': token['text'][0].isupper()
        }

    current = sentence[i]
    features = {
        'word': current['text'],
        'sent_len': len(sentence),
        'pos_in_sent': i,
        'upos': current['upos'],
        'xpos': current['xpos'],
        'first_word_in_sent': sentence[0]['text'],
        'contains_interrogative_word': has_question_word,
        'interrogative_word_position': question_word_pos,
        'contains_interrogative_particle': has_question_particle,
        'interrogative_particle_position': question_particle_pos,
        'contains_imperative_verb': has_imperative_verb,
        'imperative_verb_position': imperative_verb_pos,
        'starts_with_capital_letter': current['text'][0].isupper()
    }

    last_index = len(sentence) - 1

    # one token to the left, or a beginning-of-sentence marker
    if i > 0:
        features.update(context_features('prev_word', sentence[i-1]))
    else:
        features['BOS'] = True

    # two tokens to the left
    if i > 1:
        features.update(context_features('word_before_prev_word', sentence[i-2]))

    # one token to the right, or an end-of-sentence marker
    if i < last_index:
        features.update(context_features('next_word', sentence[i+1]))
    else:
        features['EOS'] = True

    # two tokens to the right
    if i < last_index - 1:
        features.update(context_features('word_after_next_word', sentence[i+2]))

    return features

In [22]:
X_dev = load_json('../data/X_dev.json')

In [23]:
y_dev = load_json('../data/y_dev.json')

In [24]:
verify_prepped_data(X_dev, y_dev)

### Expand train set to 30% of data

In [26]:
%%time
X_train, y_train = data_prep('../data/Train_30.txt')

2021-09-30 01:15:42 INFO: Loading these models for language: bg (Bulgarian):
| Processor | Package  |
------------------------
| tokenize  | standard |

2021-09-30 01:15:42 INFO: Use device: gpu
2021-09-30 01:15:42 INFO: Loading: tokenize
2021-09-30 01:15:42 INFO: Done loading processors!
2021-09-30 01:16:13 INFO: Loading these models for language: bg (Bulgarian):
| Processor | Package  |
------------------------
| tokenize  | standard |
| pos       | standard |

2021-09-30 01:16:13 INFO: Use device: gpu
2021-09-30 01:16:13 INFO: Loading: tokenize
2021-09-30 01:16:13 INFO: Loading: pos
2021-09-30 01:16:18 INFO: Done loading processors!


CPU times: user 33min 54s, sys: 4.56 s, total: 33min 58s
Wall time: 33min 58s


In [27]:
verify_prepped_data(X_train, y_train)

In [28]:
%%time
# Build a fresh CRF tagger and train it on the current X_train / y_train.
# c1 and c2 are the L1 and L2 regularisation coefficients.
crf = sklearn_crfsuite.CRF(
    algorithm='lbfgs',
    c1=0.1,
    c2=0.1,
    max_iterations=100,
    # also generate transition features for label pairs that never occur in the training data
    all_possible_transitions=True
)
crf.fit(X_train, y_train)

CPU times: user 58min 38s, sys: 2.54 s, total: 58min 41s
Wall time: 58min 34s


In [None]:
# Score the model on the dev set. The metrics are printed twice: first over
# every label the model knows, then without the empty label '' (the label
# word2label gives to a word that is not followed by punctuation).
labels = list(crf.classes_)
y_pred = crf.predict(X_dev)
print(metrics.flat_f1_score(y_dev, y_pred, average='weighted', labels=labels))
print(metrics.flat_classification_report(y_dev, y_pred, labels=labels, digits=3))
# in-place removal; raises ValueError if '' is not among the model's labels
labels.remove('')
print(metrics.flat_f1_score(y_dev, y_pred, average='weighted', labels=labels))
print(metrics.flat_classification_report(y_dev, y_pred, labels=labels, digits=3))
# punctuate is defined elsewhere in the notebook; presumably it re-inserts the
# predicted punctuation - only the first 100 items of its result are shown
punctuated_sentences = punctuate(X_dev, y_pred)
print(punctuated_sentences[:100])

### Expand train set to 50% of data

In [30]:
%%time
X_train, y_train = data_prep('../data/Train_50.txt')

2021-09-30 02:48:30 INFO: Loading these models for language: bg (Bulgarian):
| Processor | Package  |
------------------------
| tokenize  | standard |

2021-09-30 02:48:30 INFO: Use device: gpu
2021-09-30 02:48:30 INFO: Loading: tokenize
2021-09-30 02:48:30 INFO: Done loading processors!
2021-09-30 02:49:11 INFO: Loading these models for language: bg (Bulgarian):
| Processor | Package  |
------------------------
| tokenize  | standard |
| pos       | standard |

2021-09-30 02:49:11 INFO: Use device: gpu
2021-09-30 02:49:11 INFO: Loading: tokenize
2021-09-30 02:49:11 INFO: Loading: pos
2021-09-30 02:49:12 INFO: Done loading processors!


CPU times: user 57min 28s, sys: 6.39 s, total: 57min 34s
Wall time: 57min 32s


In [31]:
verify_prepped_data(X_train, y_train)

In [32]:
%%time
# Build a fresh CRF tagger and train it on the current X_train / y_train.
# c1 and c2 are the L1 and L2 regularisation coefficients.
crf = sklearn_crfsuite.CRF(
    algorithm='lbfgs',
    c1=0.1,
    c2=0.1,
    max_iterations=100,
    # also generate transition features for label pairs that never occur in the training data
    all_possible_transitions=True
)
crf.fit(X_train, y_train)

CPU times: user 1h 55min 54s, sys: 3.86 s, total: 1h 55min 58s
Wall time: 1h 55min 45s


In [None]:
# Score the model on the dev set. The metrics are printed twice: first over
# every label the model knows, then without the empty label '' (the label
# word2label gives to a word that is not followed by punctuation).
labels = list(crf.classes_)
y_pred = crf.predict(X_dev)
print(metrics.flat_f1_score(y_dev, y_pred, average='weighted', labels=labels))
print(metrics.flat_classification_report(y_dev, y_pred, labels=labels, digits=3))
# in-place removal; raises ValueError if '' is not among the model's labels
labels.remove('')
print(metrics.flat_f1_score(y_dev, y_pred, average='weighted', labels=labels))
print(metrics.flat_classification_report(y_dev, y_pred, labels=labels, digits=3))
# punctuate is defined elsewhere in the notebook; presumably it re-inserts the
# predicted punctuation - only the first 100 items of its result are shown
punctuated_sentences = punctuate(X_dev, y_pred)
print(punctuated_sentences[:100])

### Expand train set to 60% of data

In [34]:
%%time
X_train, y_train = data_prep('../data/Train_60.txt')

2021-09-30 05:42:06 INFO: Loading these models for language: bg (Bulgarian):
| Processor | Package  |
------------------------
| tokenize  | standard |

2021-09-30 05:42:06 INFO: Use device: gpu
2021-09-30 05:42:06 INFO: Loading: tokenize
2021-09-30 05:42:06 INFO: Done loading processors!
2021-09-30 05:42:55 INFO: Loading these models for language: bg (Bulgarian):
| Processor | Package  |
------------------------
| tokenize  | standard |
| pos       | standard |

2021-09-30 05:42:55 INFO: Use device: gpu
2021-09-30 05:42:55 INFO: Loading: tokenize
2021-09-30 05:42:55 INFO: Loading: pos
2021-09-30 05:42:57 INFO: Done loading processors!


CPU times: user 1h 9min 9s, sys: 6.31 s, total: 1h 9min 15s
Wall time: 1h 9min 14s


In [35]:
verify_prepped_data(X_train, y_train)

In [36]:
%%time
# Build a fresh CRF tagger and train it on the current X_train / y_train.
# c1 and c2 are the L1 and L2 regularisation coefficients.
crf = sklearn_crfsuite.CRF(
    algorithm='lbfgs',
    c1=0.1,
    c2=0.1,
    max_iterations=100,
    # also generate transition features for label pairs that never occur in the training data
    all_possible_transitions=True
)
crf.fit(X_train, y_train)

CPU times: user 3h 2min 3s, sys: 6.58 s, total: 3h 2min 9s
Wall time: 3h 1min 49s


In [None]:
# Score the model on the dev set. The metrics are printed twice: first over
# every label the model knows, then without the empty label '' (the label
# word2label gives to a word that is not followed by punctuation).
labels = list(crf.classes_)
y_pred = crf.predict(X_dev)
print(metrics.flat_f1_score(y_dev, y_pred, average='weighted', labels=labels))
print(metrics.flat_classification_report(y_dev, y_pred, labels=labels, digits=3))
# in-place removal; raises ValueError if '' is not among the model's labels
labels.remove('')
print(metrics.flat_f1_score(y_dev, y_pred, average='weighted', labels=labels))
print(metrics.flat_classification_report(y_dev, y_pred, labels=labels, digits=3))
# punctuate is defined elsewhere in the notebook; presumably it re-inserts the
# predicted punctuation - only the first 100 items of its result are shown
punctuated_sentences = punctuate(X_dev, y_pred)
print(punctuated_sentences[:100])

### Expand train set to 70% of data

In [25]:
%%time
X_train, y_train = data_prep('../data/Train_70.txt')

2021-09-30 12:02:25 INFO: Loading these models for language: bg (Bulgarian):
| Processor | Package  |
------------------------
| tokenize  | standard |

2021-09-30 12:02:25 INFO: Use device: gpu
2021-09-30 12:02:25 INFO: Loading: tokenize
2021-09-30 12:02:25 INFO: Done loading processors!
2021-09-30 12:03:25 INFO: Loading these models for language: bg (Bulgarian):
| Processor | Package  |
------------------------
| tokenize  | standard |
| pos       | standard |

2021-09-30 12:03:25 INFO: Use device: gpu
2021-09-30 12:03:25 INFO: Loading: tokenize
2021-09-30 12:03:25 INFO: Loading: pos
2021-09-30 12:03:28 INFO: Done loading processors!


CPU times: user 1h 20min 57s, sys: 8.48 s, total: 1h 21min 6s
Wall time: 1h 21min 4s


In [26]:
verify_prepped_data(X_train, y_train)

In [27]:
%%time
# Build a fresh CRF tagger and train it on the current X_train / y_train.
# c1 and c2 are the L1 and L2 regularisation coefficients.
crf = sklearn_crfsuite.CRF(
    algorithm='lbfgs',
    c1=0.1,
    c2=0.1,
    max_iterations=100,
    # also generate transition features for label pairs that never occur in the training data
    all_possible_transitions=True
)
crf.fit(X_train, y_train)

CPU times: user 3h 20min 38s, sys: 7.05 s, total: 3h 20min 45s
Wall time: 3h 20min 24s


In [None]:
# Score the model on the dev set. The metrics are printed twice: first over
# every label the model knows, then without the empty label '' (the label
# word2label gives to a word that is not followed by punctuation).
labels = list(crf.classes_)
y_pred = crf.predict(X_dev)
print(metrics.flat_f1_score(y_dev, y_pred, average='weighted', labels=labels))
print(metrics.flat_classification_report(y_dev, y_pred, labels=labels, digits=3))
# in-place removal; raises ValueError if '' is not among the model's labels
labels.remove('')
print(metrics.flat_f1_score(y_dev, y_pred, average='weighted', labels=labels))
print(metrics.flat_classification_report(y_dev, y_pred, labels=labels, digits=3))
# punctuate is defined elsewhere in the notebook; presumably it re-inserts the
# predicted punctuation - only the first 100 items of its result are shown
punctuated_sentences = punctuate(X_dev, y_pred)
print(punctuated_sentences[:100])

### Check for repetitive word before

In [25]:
%%time
X_train, y_train = data_prep('../data/Train.txt')

2021-09-30 20:21:14 INFO: Loading these models for language: bg (Bulgarian):
| Processor | Package  |
------------------------
| tokenize  | standard |

2021-09-30 20:21:14 INFO: Use device: gpu
2021-09-30 20:21:14 INFO: Loading: tokenize
2021-09-30 20:21:14 INFO: Done loading processors!
2021-09-30 20:21:39 INFO: Loading these models for language: bg (Bulgarian):
| Processor | Package  |
------------------------
| tokenize  | standard |
| pos       | standard |

2021-09-30 20:21:39 INFO: Use device: gpu
2021-09-30 20:21:39 INFO: Loading: tokenize
2021-09-30 20:21:39 INFO: Loading: pos
2021-09-30 20:21:41 INFO: Done loading processors!


CPU times: user 23min 56s, sys: 3.09 s, total: 23min 59s
Wall time: 23min 58s


In [27]:
verify_prepped_data(X_train, y_train)

In [28]:
%%time
X_dev, y_dev = data_prep('../data/Dev.txt')

2021-09-30 20:49:20 INFO: Loading these models for language: bg (Bulgarian):
| Processor | Package  |
------------------------
| tokenize  | standard |

2021-09-30 20:49:20 INFO: Use device: gpu
2021-09-30 20:49:20 INFO: Loading: tokenize
2021-09-30 20:49:20 INFO: Done loading processors!
2021-09-30 20:49:33 INFO: Loading these models for language: bg (Bulgarian):
| Processor | Package  |
------------------------
| tokenize  | standard |
| pos       | standard |

2021-09-30 20:49:33 INFO: Use device: gpu
2021-09-30 20:49:33 INFO: Loading: tokenize
2021-09-30 20:49:33 INFO: Loading: pos
2021-09-30 20:49:35 INFO: Done loading processors!


CPU times: user 11min 37s, sys: 1.31 s, total: 11min 38s
Wall time: 11min 38s


In [29]:
verify_prepped_data(X_dev, y_dev)

In [30]:
%%time
# Build a fresh CRF tagger and train it on the current X_train / y_train.
# c1 and c2 are the L1 and L2 regularisation coefficients.
crf = sklearn_crfsuite.CRF(
    algorithm='lbfgs',
    c1=0.1,
    c2=0.1,
    max_iterations=100,
    # also generate transition features for label pairs that never occur in the training data
    all_possible_transitions=True
)
crf.fit(X_train, y_train)

CPU times: user 37min 46s, sys: 2.11 s, total: 37min 48s
Wall time: 37min 44s


In [None]:
# Score the model on the dev set. The metrics are printed twice: first over
# every label the model knows, then without the empty label '' (the label
# word2label gives to a word that is not followed by punctuation).
labels = list(crf.classes_)
y_pred = crf.predict(X_dev)
print(metrics.flat_f1_score(y_dev, y_pred, average='weighted', labels=labels))
print(metrics.flat_classification_report(y_dev, y_pred, labels=labels, digits=3))
# in-place removal; raises ValueError if '' is not among the model's labels
labels.remove('')
print(metrics.flat_f1_score(y_dev, y_pred, average='weighted', labels=labels))
print(metrics.flat_classification_report(y_dev, y_pred, labels=labels, digits=3))
# punctuate is defined elsewhere in the notebook; presumably it re-inserts the
# predicted punctuation - only the first 100 items of its result are shown
punctuated_sentences = punctuate(X_dev, y_pred)
print(punctuated_sentences[:100])

### Check for number of verbs before

In [74]:
# input - sentence (list of dictionaries, one per token) and the index i of the token to describe
# output - a single dictionary with the features of the token at index i
def word2features(sentence, i):
    # sentence-level cues - these helpers receive the whole sentence, so their values do not depend on i
    has_question_word, question_word_pos = contains_interrogative_word(sentence)
    has_question_particle, question_particle_pos = contains_interrogative_particle(sentence)
    has_imperative_verb, imperative_verb_pos = contains_imperative_verb(sentence)
    # position-dependent cue - this helper also receives the index of the current token
    has_repeated_word_before, repeated_word_tag = contains_repetitive_word_before(sentence, i)

    # text, POS tags and capitalisation of one context token, stored under keys that start with prefix
    def context_features(prefix, token):
        return {
            prefix: token['text'],
            prefix + '_upos': token['upos'],
            prefix + '_xpos': token['xpos'],
            prefix + '_starts_with_capital_letter': token['text'][0].isupper()
        }

    current = sentence[i]
    features = {
        'word': current['text'],
        'sent_len': len(sentence),
        'pos_in_sent': i,
        'upos': current['upos'],
        'xpos': current['xpos'],
        'first_word_in_sent': sentence[0]['text'],
        'contains_interrogative_word': has_question_word,
        'interrogative_word_position': question_word_pos,
        'contains_interrogative_particle': has_question_particle,
        'interrogative_particle_position': question_particle_pos,
        'contains_imperative_verb': has_imperative_verb,
        'imperative_verb_position': imperative_verb_pos,
        'starts_with_capital_letter': current['text'][0].isupper(),
        'contains_repetitive_word_before': has_repeated_word_before,
        'repetitive_word_tag': repeated_word_tag,
        'count_of_verbs_before': count_of_verbs_before(sentence, i)
    }

    last_index = len(sentence) - 1

    # one token to the left, or a beginning-of-sentence marker
    if i > 0:
        features.update(context_features('prev_word', sentence[i-1]))
    else:
        features['BOS'] = True

    # two tokens to the left
    if i > 1:
        features.update(context_features('word_before_prev_word', sentence[i-2]))

    # one token to the right, or an end-of-sentence marker
    if i < last_index:
        features.update(context_features('next_word', sentence[i+1]))
    else:
        features['EOS'] = True

    # two tokens to the right
    if i < last_index - 1:
        features.update(context_features('word_after_next_word', sentence[i+2]))

    return features

In [75]:
%%time
X_train, y_train = data_prep('../data/Train.txt')

2021-09-30 22:35:58 INFO: Loading these models for language: bg (Bulgarian):
| Processor | Package  |
------------------------
| tokenize  | standard |

2021-09-30 22:35:58 INFO: Use device: gpu
2021-09-30 22:35:58 INFO: Loading: tokenize
2021-09-30 22:35:58 INFO: Done loading processors!
2021-09-30 22:36:16 INFO: Loading these models for language: bg (Bulgarian):
| Processor | Package  |
------------------------
| tokenize  | standard |
| pos       | standard |

2021-09-30 22:36:16 INFO: Use device: gpu
2021-09-30 22:36:16 INFO: Loading: tokenize
2021-09-30 22:36:16 INFO: Loading: pos
2021-09-30 22:36:18 INFO: Done loading processors!


CPU times: user 24min 6s, sys: 2.42 s, total: 24min 8s
Wall time: 24min 7s


In [76]:
verify_prepped_data(X_train, y_train)

In [77]:
%%time
X_dev, y_dev = data_prep('../data/Dev.txt')

2021-09-30 23:00:06 INFO: Loading these models for language: bg (Bulgarian):
| Processor | Package  |
------------------------
| tokenize  | standard |

2021-09-30 23:00:06 INFO: Use device: gpu
2021-09-30 23:00:06 INFO: Loading: tokenize
2021-09-30 23:00:06 INFO: Done loading processors!
2021-09-30 23:00:19 INFO: Loading these models for language: bg (Bulgarian):
| Processor | Package  |
------------------------
| tokenize  | standard |
| pos       | standard |

2021-09-30 23:00:19 INFO: Use device: gpu
2021-09-30 23:00:19 INFO: Loading: tokenize
2021-09-30 23:00:19 INFO: Loading: pos
2021-09-30 23:00:21 INFO: Done loading processors!


CPU times: user 11min 58s, sys: 1.18 s, total: 11min 59s
Wall time: 11min 59s


In [78]:
verify_prepped_data(X_dev, y_dev)

In [79]:
%%time
# Build a fresh CRF tagger and train it on the current X_train / y_train.
# c1 and c2 are the L1 and L2 regularisation coefficients.
crf = sklearn_crfsuite.CRF(
    algorithm='lbfgs',
    c1=0.1,
    c2=0.1,
    max_iterations=100,
    # also generate transition features for label pairs that never occur in the training data
    all_possible_transitions=True
)
crf.fit(X_train, y_train)

CPU times: user 46min 27s, sys: 2.71 s, total: 46min 30s
Wall time: 46min 24s


In [None]:
# Score the model on the dev set. The metrics are printed twice: first over
# every label the model knows, then without the empty label '' (the label
# word2label gives to a word that is not followed by punctuation).
labels = list(crf.classes_)
y_pred = crf.predict(X_dev)
print(metrics.flat_f1_score(y_dev, y_pred, average='weighted', labels=labels))
print(metrics.flat_classification_report(y_dev, y_pred, labels=labels, digits=3))
# in-place removal; raises ValueError if '' is not among the model's labels
labels.remove('')
print(metrics.flat_f1_score(y_dev, y_pred, average='weighted', labels=labels))
print(metrics.flat_classification_report(y_dev, y_pred, labels=labels, digits=3))
# punctuate is defined elsewhere in the notebook; presumably it re-inserts the
# predicted punctuation - only the first 100 items of its result are shown
punctuated_sentences = punctuate(X_dev, y_pred)
print(punctuated_sentences[:100])

### Check results with some features excluded

In [23]:
X_train = load_json('../data/X_train.json')

In [24]:
y_train = load_json('../data/y_train.json')

In [25]:
X_dev = load_json('../data/X_dev.json')

In [26]:
y_dev = load_json('../data/y_dev.json')

In [28]:
# Ablation copies of the dev features: every variant leaves out one group of
# feature keys, and 'count_of_verbs_before' is left out of all of them as well.

# without the first word of the sentence
X_dev_without_first_word_in_sent = [
    [
        {name: value for name, value in token.items()
         if name not in ('first_word_in_sent', 'count_of_verbs_before')}
        for token in tokens
    ]
    for tokens in X_dev
]

# without the sentence length
X_dev_without_sent_len = [
    [
        {name: value for name, value in token.items()
         if name not in ('sent_len', 'count_of_verbs_before')}
        for token in tokens
    ]
    for tokens in X_dev
]

# without any of the position features
X_dev_without_pos = [
    [
        {name: value for name, value in token.items()
         if name not in ('pos_in_sent', 'interrogative_word_position', 'interrogative_particle_position', 'imperative_verb_position', 'count_of_verbs_before')}
        for token in tokens
    ]
    for tokens in X_dev
]

# without the upos tag of the word and of its four neighbours
X_dev_without_upos = [
    [
        {name: value for name, value in token.items()
         if name not in ('upos', 'prev_word_upos', 'word_before_prev_word_upos', 'next_word_upos', 'word_after_next_word_upos', 'count_of_verbs_before')}
        for token in tokens
    ]
    for tokens in X_dev
]

In [29]:
# Ablation copies of the train features: every variant leaves out one group of
# feature keys, and 'count_of_verbs_before' is left out of all of them as well.

# without the first word of the sentence
X_train_without_first_word_in_sent = [
    [
        {name: value for name, value in token.items()
         if name not in ('first_word_in_sent', 'count_of_verbs_before')}
        for token in tokens
    ]
    for tokens in X_train
]

# without the sentence length
X_train_without_sent_len = [
    [
        {name: value for name, value in token.items()
         if name not in ('sent_len', 'count_of_verbs_before')}
        for token in tokens
    ]
    for tokens in X_train
]

# without any of the position features
X_train_without_pos = [
    [
        {name: value for name, value in token.items()
         if name not in ('pos_in_sent', 'interrogative_word_position', 'interrogative_particle_position', 'imperative_verb_position', 'count_of_verbs_before')}
        for token in tokens
    ]
    for tokens in X_train
]

# without the upos tag of the word and of its four neighbours
X_train_without_upos = [
    [
        {name: value for name, value in token.items()
         if name not in ('upos', 'prev_word_upos', 'word_before_prev_word_upos', 'next_word_upos', 'word_after_next_word_upos', 'count_of_verbs_before')}
        for token in tokens
    ]
    for tokens in X_train
]

In [38]:
verify_prepped_data(X_dev_without_first_word_in_sent, y_dev)

In [39]:
verify_prepped_data(X_dev_without_sent_len, y_dev)

In [41]:
verify_prepped_data(X_dev_without_pos, y_dev)

In [42]:
verify_prepped_data(X_dev_without_upos, y_dev)

In [43]:
%%time
# Ablation run: train a fresh CRF tagger on the feature set without 'first_word_in_sent'.
# c1 and c2 are the L1 and L2 regularisation coefficients.
crf = sklearn_crfsuite.CRF(
    algorithm='lbfgs',
    c1=0.1,
    c2=0.1,
    max_iterations=100,
    # also generate transition features for label pairs that never occur in the training data
    all_possible_transitions=True
)
crf.fit(X_train_without_first_word_in_sent, y_train)

CPU times: user 30min 59s, sys: 999 ms, total: 31min
Wall time: 30min 58s


In [44]:
# Score the ablated model on the matching dev features. The metrics are printed
# twice: first over every label the model knows, then without the empty label ''
# (the label word2label gives to a word that is not followed by punctuation).
labels = list(crf.classes_)
y_pred = crf.predict(X_dev_without_first_word_in_sent)
print(metrics.flat_f1_score(y_dev, y_pred, average='weighted', labels=labels))
print(metrics.flat_classification_report(y_dev, y_pred, labels=labels, digits=3))
# in-place removal; raises ValueError if '' is not among the model's labels
labels.remove('')
print(metrics.flat_f1_score(y_dev, y_pred, average='weighted', labels=labels))
print(metrics.flat_classification_report(y_dev, y_pred, labels=labels, digits=3))
# punctuate is defined elsewhere in the notebook; presumably it re-inserts the
# predicted punctuation - only the first 100 items of its result are shown
punctuated_sentences = punctuate(X_dev_without_first_word_in_sent, y_pred)
print(punctuated_sentences[:100])

  average, "true nor predicted", 'F-score is', len(true_sum)


0.9390237683618073


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


              precision    recall  f1-score   support

                  0.959     0.989     0.974    227825
           :      0.893     0.713     0.793       923
           !      0.429     0.156     0.228       347
           ,      0.799     0.639     0.710     18592
           ?      0.828     0.630     0.715       443
           ;      0.221     0.037     0.063       622
           .      0.931     0.986     0.958     11401
           -      0.563     0.111     0.186       882
          ,(      0.000     0.000     0.000         5
           "      0.636     0.140     0.230      1596
         ").      0.000     0.000     0.000         0
           (      0.965     0.883     0.922      1656
           )      0.880     0.846     0.863      1001
          ?-      0.000     0.000     0.000         4
          .-      0.000     0.000     0.000         2
          :)      0.000     0.000     0.000         0
          !-      0.000     0.000     0.000        20
          :"      0.636    



              precision    recall  f1-score   support

           :      0.893     0.713     0.793       923
           !      0.429     0.156     0.228       347
           ,      0.799     0.639     0.710     18592
           ?      0.828     0.630     0.715       443
           ;      0.221     0.037     0.063       622
           .      0.931     0.986     0.958     11401
           -      0.563     0.111     0.186       882
          ,(      0.000     0.000     0.000         5
           "      0.636     0.140     0.230      1596
         ").      0.000     0.000     0.000         0
           (      0.965     0.883     0.922      1656
           )      0.880     0.846     0.863      1001
          ?-      0.000     0.000     0.000         4
          .-      0.000     0.000     0.000         2
          :)      0.000     0.000     0.000         0
          !-      0.000     0.000     0.000        20
          :"      0.636     0.104     0.179        67
          ."      0.167    

In [45]:
%%time
# Ablation run: train a fresh CRF tagger on the feature set without 'sent_len'.
# c1 and c2 are the L1 and L2 regularisation coefficients.
crf = sklearn_crfsuite.CRF(
    algorithm='lbfgs',
    c1=0.1,
    c2=0.1,
    max_iterations=100,
    # also generate transition features for label pairs that never occur in the training data
    all_possible_transitions=True
)
crf.fit(X_train_without_sent_len, y_train)

CPU times: user 28min 13s, sys: 924 ms, total: 28min 14s
Wall time: 28min 11s


In [46]:
# Score the ablated model on the matching dev features. The metrics are printed
# twice: first over every label the model knows, then without the empty label ''
# (the label word2label gives to a word that is not followed by punctuation).
labels = list(crf.classes_)
y_pred = crf.predict(X_dev_without_sent_len)
print(metrics.flat_f1_score(y_dev, y_pred, average='weighted', labels=labels))
print(metrics.flat_classification_report(y_dev, y_pred, labels=labels, digits=3))
# in-place removal; raises ValueError if '' is not among the model's labels
labels.remove('')
print(metrics.flat_f1_score(y_dev, y_pred, average='weighted', labels=labels))
print(metrics.flat_classification_report(y_dev, y_pred, labels=labels, digits=3))
# punctuate is defined elsewhere in the notebook; presumably it re-inserts the
# predicted punctuation - only the first 100 items of its result are shown
punctuated_sentences = punctuate(X_dev_without_sent_len, y_pred)
print(punctuated_sentences[:100])

  average, "true nor predicted", 'F-score is', len(true_sum)


0.9389738459216473


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


              precision    recall  f1-score   support

                  0.959     0.989     0.974    227825
           :      0.900     0.731     0.807       923
           !      0.489     0.184     0.268       347
           ,      0.797     0.642     0.711     18592
           ?      0.852     0.677     0.755       443
           ;      0.234     0.042     0.071       622
           .      0.933     0.987     0.959     11401
           -      0.547     0.100     0.169       882
          ,(      0.000     0.000     0.000         5
           "      0.633     0.130     0.215      1596
         ").      0.000     0.000     0.000         0
           (      0.970     0.876     0.920      1656
           )      0.879     0.838     0.858      1001
          ?-      0.000     0.000     0.000         4
          .-      0.000     0.000     0.000         2
          :)      0.000     0.000     0.000         0
          !-      0.000     0.000     0.000        20
          :"      0.769    



              precision    recall  f1-score   support

           :      0.900     0.731     0.807       923
           !      0.489     0.184     0.268       347
           ,      0.797     0.642     0.711     18592
           ?      0.852     0.677     0.755       443
           ;      0.234     0.042     0.071       622
           .      0.933     0.987     0.959     11401
           -      0.547     0.100     0.169       882
          ,(      0.000     0.000     0.000         5
           "      0.633     0.130     0.215      1596
         ").      0.000     0.000     0.000         0
           (      0.970     0.876     0.920      1656
           )      0.879     0.838     0.858      1001
          ?-      0.000     0.000     0.000         4
          .-      0.000     0.000     0.000         2
          :)      0.000     0.000     0.000         0
          !-      0.000     0.000     0.000        20
          :"      0.769     0.149     0.250        67
          ."      0.333    

In [47]:
%%time
# Ablation run: train a fresh CRF tagger on the feature set without the position features.
# c1 and c2 are the L1 and L2 regularisation coefficients.
crf = sklearn_crfsuite.CRF(
    algorithm='lbfgs',
    c1=0.1,
    c2=0.1,
    max_iterations=100,
    # also generate transition features for label pairs that never occur in the training data
    all_possible_transitions=True
)
crf.fit(X_train_without_pos, y_train)

CPU times: user 27min 49s, sys: 836 ms, total: 27min 50s
Wall time: 27min 47s


In [48]:
# Score the ablated model on the matching dev features. The metrics are printed
# twice: first over every label the model knows, then without the empty label ''
# (the label word2label gives to a word that is not followed by punctuation).
labels = list(crf.classes_)
y_pred = crf.predict(X_dev_without_pos)
print(metrics.flat_f1_score(y_dev, y_pred, average='weighted', labels=labels))
print(metrics.flat_classification_report(y_dev, y_pred, labels=labels, digits=3))
# in-place removal; raises ValueError if '' is not among the model's labels
labels.remove('')
print(metrics.flat_f1_score(y_dev, y_pred, average='weighted', labels=labels))
print(metrics.flat_classification_report(y_dev, y_pred, labels=labels, digits=3))
# punctuate is defined elsewhere in the notebook; presumably it re-inserts the
# predicted punctuation - only the first 100 items of its result are shown
punctuated_sentences = punctuate(X_dev_without_pos, y_pred)
print(punctuated_sentences[:100])

  average, "true nor predicted", 'F-score is', len(true_sum)


0.9386445886287452


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


              precision    recall  f1-score   support

                  0.958     0.990     0.974    227825
           :      0.916     0.705     0.797       923
           !      0.523     0.199     0.288       347
           ,      0.807     0.630     0.708     18592
           ?      0.870     0.679     0.763       443
           ;      0.280     0.042     0.073       622
           .      0.933     0.988     0.960     11401
           -      0.591     0.100     0.171       882
          ,(      0.000     0.000     0.000         5
           "      0.620     0.128     0.212      1596
         ").      0.000     0.000     0.000         0
           (      0.973     0.872     0.920      1656
           )      0.879     0.831     0.855      1001
          ?-      0.000     0.000     0.000         4
          .-      0.000     0.000     0.000         2
          :)      0.000     0.000     0.000         0
          !-      0.000     0.000     0.000        20
          :"      0.667    



              precision    recall  f1-score   support

           :      0.916     0.705     0.797       923
           !      0.523     0.199     0.288       347
           ,      0.807     0.630     0.708     18592
           ?      0.870     0.679     0.763       443
           ;      0.280     0.042     0.073       622
           .      0.933     0.988     0.960     11401
           -      0.591     0.100     0.171       882
          ,(      0.000     0.000     0.000         5
           "      0.620     0.128     0.212      1596
         ").      0.000     0.000     0.000         0
           (      0.973     0.872     0.920      1656
           )      0.879     0.831     0.855      1001
          ?-      0.000     0.000     0.000         4
          .-      0.000     0.000     0.000         2
          :)      0.000     0.000     0.000         0
          !-      0.000     0.000     0.000        20
          :"      0.667     0.090     0.158        67
          ."      0.333    

In [49]:
%%time
# Ablation run: train a fresh CRF tagger on the feature set without the upos features.
# c1 and c2 are the L1 and L2 regularisation coefficients.
crf = sklearn_crfsuite.CRF(
    algorithm='lbfgs',
    c1=0.1,
    c2=0.1,
    max_iterations=100,
    # also generate transition features for label pairs that never occur in the training data
    all_possible_transitions=True
)
crf.fit(X_train_without_upos, y_train)

CPU times: user 26min 55s, sys: 721 ms, total: 26min 56s
Wall time: 26min 53s


In [50]:
# Score the ablated model on the matching dev features. The metrics are printed
# twice: first over every label the model knows, then without the empty label ''
# (the label word2label gives to a word that is not followed by punctuation).
labels = list(crf.classes_)
y_pred = crf.predict(X_dev_without_upos)
print(metrics.flat_f1_score(y_dev, y_pred, average='weighted', labels=labels))
print(metrics.flat_classification_report(y_dev, y_pred, labels=labels, digits=3))
# in-place removal; raises ValueError if '' is not among the model's labels
labels.remove('')
print(metrics.flat_f1_score(y_dev, y_pred, average='weighted', labels=labels))
print(metrics.flat_classification_report(y_dev, y_pred, labels=labels, digits=3))
# punctuate is defined elsewhere in the notebook; presumably it re-inserts the
# predicted punctuation - only the first 100 items of its result are shown
punctuated_sentences = punctuate(X_dev_without_upos, y_pred)
print(punctuated_sentences[:100])

  average, "true nor predicted", 'F-score is', len(true_sum)


0.9388912093385249


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


              precision    recall  f1-score   support

                  0.959     0.989     0.974    227825
           :      0.915     0.711     0.800       923
           !      0.522     0.202     0.291       347
           ,      0.797     0.640     0.710     18592
           ?      0.860     0.679     0.759       443
           ;      0.237     0.045     0.076       622
           .      0.934     0.988     0.960     11401
           -      0.574     0.101     0.172       882
          ,(      0.000     0.000     0.000         5
           "      0.614     0.128     0.212      1596
         ").      0.000     0.000     0.000         0
           (      0.968     0.869     0.916      1656
           )      0.874     0.829     0.851      1001
          ?-      0.143     0.250     0.182         4
          .-      0.000     0.000     0.000         2
          :)      0.000     0.000     0.000         0
          !-      0.000     0.000     0.000        20
          :"      0.700    



              precision    recall  f1-score   support

           :      0.915     0.711     0.800       923
           !      0.522     0.202     0.291       347
           ,      0.797     0.640     0.710     18592
           ?      0.860     0.679     0.759       443
           ;      0.237     0.045     0.076       622
           .      0.934     0.988     0.960     11401
           -      0.574     0.101     0.172       882
          ,(      0.000     0.000     0.000         5
           "      0.614     0.128     0.212      1596
         ").      0.000     0.000     0.000         0
           (      0.968     0.869     0.916      1656
           )      0.874     0.829     0.851      1001
          ?-      0.143     0.250     0.182         4
          .-      0.000     0.000     0.000         2
          :)      0.000     0.000     0.000         0
          !-      0.000     0.000     0.000        20
          :"      0.700     0.104     0.182        67
          ."      0.333    

### Try out a model with verb count for the whole word window

In [23]:
# input - sentence (list of dictionaries, one per token) and the index i of the word to describe
# output - a single dictionary of features for the word at index i
def word2features(sentence, i):
    # Sentence-wide cues. Each helper result is unpacked into two values:
    # a flag plus a position (or, for the repetition check, a tag).
    has_question_word, question_word_pos = contains_interrogative_word(sentence)
    has_question_particle, question_particle_pos = contains_interrogative_particle(sentence)
    has_imperative, imperative_pos = contains_imperative_verb(sentence)
    has_repeat, repeat_tag = contains_repetitive_word_before(sentence, i)

    current = sentence[i]
    size = len(sentence)

    # Features of the word itself plus the sentence-level cues.
    features = {}
    features['word'] = current['text']
    features['sent_len'] = size
    features['pos_in_sent'] = i
    features['upos'] = current['upos']
    features['xpos'] = current['xpos']
    features['first_word_in_sent'] = sentence[0]['text']
    features['contains_interrogative_word'] = has_question_word
    features['interrogative_word_position'] = question_word_pos
    features['contains_interrogative_particle'] = has_question_particle
    features['interrogative_particle_position'] = question_particle_pos
    features['contains_imperative_verb'] = has_imperative
    features['imperative_verb_position'] = imperative_pos
    features['starts_with_capital_letter'] = current['text'][0].isupper()
    features['contains_repetitive_word_before'] = has_repeat
    features['repetitive_word_tag'] = repeat_tag
    features['count_of_verbs_before'] = count_of_verbs_before(sentence, i)

    # One word to the left, or a beginning-of-sentence marker when there is none.
    if i > 0:
        left = sentence[i - 1]
        features['prev_word'] = left['text']
        features['prev_word_upos'] = left['upos']
        features['prev_word_xpos'] = left['xpos']
        features['prev_word_starts_with_capital_letter'] = left['text'][0].isupper()
        features['count_of_verbs_before_prev_word'] = count_of_verbs_before(sentence, i - 1)
    else:
        features['BOS'] = True

    # Two words to the left.
    if i > 1:
        far_left = sentence[i - 2]
        features['word_before_prev_word'] = far_left['text']
        features['word_before_prev_word_upos'] = far_left['upos']
        features['word_before_prev_word_xpos'] = far_left['xpos']
        features['word_before_prev_word_starts_with_capital_letter'] = far_left['text'][0].isupper()
        features['count_of_verbs_before_word_before_prev_word'] = count_of_verbs_before(sentence, i - 2)

    # One word to the right, or an end-of-sentence marker when there is none.
    if i < size - 1:
        right = sentence[i + 1]
        features['next_word'] = right['text']
        features['next_word_upos'] = right['upos']
        features['next_word_xpos'] = right['xpos']
        features['next_word_starts_with_capital_letter'] = right['text'][0].isupper()
        features['count_of_verbs_before_next_word'] = count_of_verbs_before(sentence, i + 1)
    else:
        features['EOS'] = True

    # Two words to the right.
    if i < size - 2:
        far_right = sentence[i + 2]
        features['word_after_next_word'] = far_right['text']
        features['word_after_next_word_upos'] = far_right['upos']
        features['word_after_next_word_xpos'] = far_right['xpos']
        features['word_after_next_word_starts_with_capital_letter'] = far_right['text'][0].isupper()
        features['count_of_verbs_before_word_after_next_word'] = count_of_verbs_before(sentence, i + 2)

    return features

In [24]:
%%time
# Prepare the full training split. data_prep's two return values are kept as
# X_train / y_train and are what crf.fit receives further down.
# NOTE: the recorded run of this cell took about 20 minutes of wall time.
X_train, y_train = data_prep('../data/Train.txt')

2021-10-01 12:09:48 INFO: Loading these models for language: bg (Bulgarian):
| Processor | Package  |
------------------------
| tokenize  | standard |

2021-10-01 12:09:48 INFO: Use device: gpu
2021-10-01 12:09:48 INFO: Loading: tokenize
2021-10-01 12:09:48 INFO: Done loading processors!
2021-10-01 12:10:02 INFO: Loading these models for language: bg (Bulgarian):
| Processor | Package  |
------------------------
| tokenize  | standard |
| pos       | standard |

2021-10-01 12:10:02 INFO: Use device: gpu
2021-10-01 12:10:02 INFO: Loading: tokenize
2021-10-01 12:10:02 INFO: Loading: pos
2021-10-01 12:10:05 INFO: Done loading processors!


CPU times: user 20min 40s, sys: 2.47 s, total: 20min 43s
Wall time: 20min 42s


In [25]:
# Run the notebook's verify_prepped_data check on the training pair.
# NOTE(review): the helper is defined outside this section; presumably it checks
# that features and labels line up - confirm against its definition.
verify_prepped_data(X_train, y_train)

In [26]:
%%time
# Prepare the dev split the same way as the training split; X_dev / y_dev are
# used for prediction and scoring after the model is fitted.
# NOTE: the recorded run of this cell took about 11.5 minutes of wall time.
X_dev, y_dev = data_prep('../data/Dev.txt')

2021-10-01 12:30:31 INFO: Loading these models for language: bg (Bulgarian):
| Processor | Package  |
------------------------
| tokenize  | standard |

2021-10-01 12:30:31 INFO: Use device: gpu
2021-10-01 12:30:31 INFO: Loading: tokenize
2021-10-01 12:30:31 INFO: Done loading processors!
2021-10-01 12:30:44 INFO: Loading these models for language: bg (Bulgarian):
| Processor | Package  |
------------------------
| tokenize  | standard |
| pos       | standard |

2021-10-01 12:30:44 INFO: Use device: gpu
2021-10-01 12:30:44 INFO: Loading: tokenize
2021-10-01 12:30:44 INFO: Loading: pos
2021-10-01 12:30:46 INFO: Done loading processors!


CPU times: user 11min 31s, sys: 1.28 s, total: 11min 32s
Wall time: 11min 31s


In [27]:
# Run the notebook's verify_prepped_data check on the dev pair.
# NOTE(review): the helper is defined outside this section; presumably it checks
# that features and labels line up - confirm against its definition.
verify_prepped_data(X_dev, y_dev)

In [28]:
%%time
# Fit a CRF on the verb-count feature set using L-BFGS training.
# NOTE: the recorded run of this cell took about 29.5 minutes of wall time.
crf = sklearn_crfsuite.CRF(
    algorithm='lbfgs',
    c1=0.1,  # L1 regularization coefficient
    c2=0.1,  # L2 regularization coefficient
    max_iterations=100,  # upper bound on optimizer iterations
    all_possible_transitions=True  # also generate transition features for label pairs not seen in training
)
crf.fit(X_train, y_train)

CPU times: user 29min 33s, sys: 1.04 s, total: 29min 34s
Wall time: 29min 31s


In [None]:
# Evaluate on the dev split. First pass: score every label the CRF learned.
labels = list(crf.classes_)
y_pred = crf.predict(X_dev)
print(metrics.flat_f1_score(y_dev, y_pred, average='weighted', labels=labels))
print(metrics.flat_classification_report(y_dev, y_pred, labels=labels, digits=3))
# Second pass: drop the empty label so the scores cover only the remaining labels.
# '' is what word2label assigns to a word that is not followed by punctuation.
labels.remove('')
print(metrics.flat_f1_score(y_dev, y_pred, average='weighted', labels=labels))
print(metrics.flat_classification_report(y_dev, y_pred, labels=labels, digits=3))
# NOTE(review): punctuate is defined outside this section; presumably it re-inserts
# the predicted marks into the dev sentences - confirm against its definition.
punctuated_sentences = punctuate(X_dev, y_pred)
print(punctuated_sentences[:100])  # show only the first 100 items

### Try out best model with 50% of train data

In [None]:
# The verb-count word2features earlier in this notebook unpacks the sentence-level
# helpers' results into two values (flag, position), while this feature set stores a
# single value per helper. Keep only the flag when a pair comes back, and pass a bare
# value through unchanged, so the feature is never a whole tuple.
def _flag_only(helper_result):
    if isinstance(helper_result, (tuple, list)):
        return helper_result[0]
    return helper_result


# input - sentence (list of dictionaries, one per token) and the index i of the word to describe
# output - a single dictionary of features for the word at index i
def word2features(sentence, i):
    # Sentence-level cues: only the flag is used by this feature set (no positions).
    sentence_contains_interrogative_word = _flag_only(contains_interrogative_word(sentence))
    sentence_contains_interrogative_particle = _flag_only(contains_interrogative_particle(sentence))
    sentence_contains_imperative_verb = _flag_only(contains_imperative_verb(sentence))
    # This helper's result is unpacked into a flag and a tag, as in the original cell.
    sentence_contains_repetitive_word_before, repetitive_word_tag = contains_repetitive_word_before(sentence, i)

    # Features of the word itself plus the sentence-level cues.
    features = {
        'word': sentence[i]['text'],
        'sent_len': len(sentence),
        'upos': sentence[i]['upos'],
        'xpos': sentence[i]['xpos'],
        'first_word_in_sent': sentence[0]['text'],
        'contains_interrogative_word': sentence_contains_interrogative_word,
        'contains_interrogative_particle': sentence_contains_interrogative_particle,
        'contains_imperative_verb': sentence_contains_imperative_verb,
        'starts_with_capital_letter': sentence[i]['text'][0].isupper(),
        'contains_repetitive_word_before': sentence_contains_repetitive_word_before,
        'repetitive_word_tag': repetitive_word_tag
    }

    # One word to the left, or a beginning-of-sentence marker when there is none.
    if i > 0:
        features.update({
            'prev_word': sentence[i-1]['text'],
            'prev_word_upos': sentence[i-1]['upos'],
            'prev_word_xpos': sentence[i-1]['xpos'],
            'prev_word_starts_with_capital_letter': sentence[i-1]['text'][0].isupper()
        })
    else:
        features.update({
            'BOS': True
        })

    # Two words to the left.
    if i > 1:
        features.update({
            'word_before_prev_word': sentence[i-2]['text'],
            'word_before_prev_word_upos': sentence[i-2]['upos'],
            'word_before_prev_word_xpos': sentence[i-2]['xpos'],
            'word_before_prev_word_starts_with_capital_letter': sentence[i-2]['text'][0].isupper()
        })

    # One word to the right, or an end-of-sentence marker when there is none.
    if i < len(sentence)-1:
        features.update({
            'next_word': sentence[i+1]['text'],
            'next_word_upos': sentence[i+1]['upos'],
            'next_word_xpos': sentence[i+1]['xpos'],
            'next_word_starts_with_capital_letter': sentence[i+1]['text'][0].isupper()
        })
    else:
        features.update({
            'EOS': True
        })

    # Two words to the right.
    if i < len(sentence)-2:
        features.update({
            'word_after_next_word': sentence[i+2]['text'],
            'word_after_next_word_upos': sentence[i+2]['upos'],
            'word_after_next_word_xpos': sentence[i+2]['xpos'],
            'word_after_next_word_starts_with_capital_letter': sentence[i+2]['text'][0].isupper()
        })

    return features

In [None]:
%%time
# Prepare the training file for the 50%-of-train-data experiment; the two return
# values replace the kernel-global X_train / y_train used by the next crf.fit.
X_train, y_train = data_prep('../data/Train_50.txt')

In [None]:
# Run the notebook's verify_prepped_data check on the 50% training pair.
# NOTE(review): the helper is defined outside this section; presumably it checks
# that features and labels line up - confirm against its definition.
verify_prepped_data(X_train, y_train)

In [None]:
%%time
# Re-prepare the dev split for this experiment, replacing the kernel-global
# X_dev / y_dev used for prediction and scoring below.
X_dev, y_dev = data_prep('../data/Dev.txt')

In [None]:
# Run the notebook's verify_prepped_data check on the dev pair.
# NOTE(review): the helper is defined outside this section; presumably it checks
# that features and labels line up - confirm against its definition.
verify_prepped_data(X_dev, y_dev)

In [None]:
%%time
# Fit a CRF on the 50% training data using L-BFGS training, with the same
# hyperparameters as the other CRF cells in this part of the notebook.
crf = sklearn_crfsuite.CRF(
    algorithm='lbfgs',
    c1=0.1,  # L1 regularization coefficient
    c2=0.1,  # L2 regularization coefficient
    max_iterations=100,  # upper bound on optimizer iterations
    all_possible_transitions=True  # also generate transition features for label pairs not seen in training
)
crf.fit(X_train, y_train)

In [None]:
# Evaluate on the dev split. First pass: score every label the CRF learned.
labels = list(crf.classes_)
y_pred = crf.predict(X_dev)
print(metrics.flat_f1_score(y_dev, y_pred, average='weighted', labels=labels))
print(metrics.flat_classification_report(y_dev, y_pred, labels=labels, digits=3))
# Second pass: drop the empty label so the scores cover only the remaining labels.
# '' is what word2label assigns to a word that is not followed by punctuation.
labels.remove('')
print(metrics.flat_f1_score(y_dev, y_pred, average='weighted', labels=labels))
print(metrics.flat_classification_report(y_dev, y_pred, labels=labels, digits=3))
# Third pass: restrict scoring to this fixed list of single marks plus '...',
# leaving out any other label the model may have learned.
labels = [',','(',')','"','[',']',';','.','?','!',':','-', '...']
print(metrics.flat_f1_score(y_dev, y_pred, average='weighted', labels=labels))
print(metrics.flat_classification_report(y_dev, y_pred, labels=labels, digits=3))
# NOTE(review): punctuate is defined outside this section; presumably it re-inserts
# the predicted marks into the dev sentences - confirm against its definition.
punctuated_sentences = punctuate(X_dev, y_pred)
print(punctuated_sentences[:100])  # show only the first 100 items

### Try out best model with 60% of train data

In [None]:
%%time
# Prepare the training file for the 60%-of-train-data experiment; the two return
# values replace the kernel-global X_train / y_train used by the next crf.fit.
X_train, y_train = data_prep('../data/Train_60.txt')

In [None]:
# Run the notebook's verify_prepped_data check on the 60% training pair.
# NOTE(review): the helper is defined outside this section; presumably it checks
# that features and labels line up - confirm against its definition.
verify_prepped_data(X_train, y_train)

In [None]:
%%time
# Fit a CRF on the 60% training data using L-BFGS training, with the same
# hyperparameters as the other CRF cells in this part of the notebook.
crf = sklearn_crfsuite.CRF(
    algorithm='lbfgs',
    c1=0.1,  # L1 regularization coefficient
    c2=0.1,  # L2 regularization coefficient
    max_iterations=100,  # upper bound on optimizer iterations
    all_possible_transitions=True  # also generate transition features for label pairs not seen in training
)
crf.fit(X_train, y_train)

In [None]:
# Evaluate on the dev split. X_dev / y_dev are not rebuilt in this section, so
# they are whatever an earlier cell last assigned.
# First pass: score every label the CRF learned.
labels = list(crf.classes_)
y_pred = crf.predict(X_dev)
print(metrics.flat_f1_score(y_dev, y_pred, average='weighted', labels=labels))
print(metrics.flat_classification_report(y_dev, y_pred, labels=labels, digits=3))
# Second pass: drop the empty label so the scores cover only the remaining labels.
# '' is what word2label assigns to a word that is not followed by punctuation.
labels.remove('')
print(metrics.flat_f1_score(y_dev, y_pred, average='weighted', labels=labels))
print(metrics.flat_classification_report(y_dev, y_pred, labels=labels, digits=3))
# Third pass: restrict scoring to this fixed list of single marks plus '...',
# leaving out any other label the model may have learned.
labels = [',','(',')','"','[',']',';','.','?','!',':','-', '...']
print(metrics.flat_f1_score(y_dev, y_pred, average='weighted', labels=labels))
print(metrics.flat_classification_report(y_dev, y_pred, labels=labels, digits=3))
# NOTE(review): punctuate is defined outside this section; presumably it re-inserts
# the predicted marks into the dev sentences - confirm against its definition.
punctuated_sentences = punctuate(X_dev, y_pred)
print(punctuated_sentences[:100])  # show only the first 100 items