### Function Definitions

In [1]:
import json
import os
import re

import classla
import sklearn_crfsuite
from sklearn_crfsuite import metrics

In [2]:
def read_file_as_list_of_sentences(input_file_name, encoding="utf-8"):
    """Read a text file that holds one sentence per line.

    Args:
        input_file_name: Path of the text file to read.
        encoding: Text encoding used to decode the file. Defaults to UTF-8 so
            that the result does not depend on the platform's locale encoding.

    Returns:
        A list of strings, one per line, without line terminators.
    """
    # An explicit encoding keeps non-ASCII text from being decoded differently
    # (or failing) on machines whose default encoding is not UTF-8.
    with open(input_file_name, "r", encoding=encoding) as input_file:
        return input_file.read().splitlines()

In [3]:
def run_through_classla_pipeline(list_of_sentences, pipeline):
    """Run every input string through ``pipeline`` and collect the results.

    Args:
        list_of_sentences: List of strings to process, one call per string.
        pipeline: Callable (tokenize or tokenize+POS pipeline) whose result
            exposes ``to_dict()``.

    Returns:
        A list with one entry per input string: the ``[0][0]`` element of the
        processed document's ``to_dict()`` output.
    """
    processed = []
    for sentence in list_of_sentences:
        document = pipeline(sentence)
        # NOTE(review): only element [0][0] is kept - presumably the token
        # dictionaries of the first detected sentence; anything the pipeline
        # returns beyond that is discarded. Confirm against classla's layout.
        processed.append(document.to_dict()[0][0])
    return processed

In [4]:
def squash_punctuation(sentence):
    """Merge runs of consecutive punctuation tokens into a single token.

    A token counts as punctuation when every character of its ``'text'`` is one
    of ``,()"[];.?!:-`` (the same per-character rule the labelling and
    punctuation-removal helpers use), so multi-character tokens such as ``...``
    are recognised as well.

    Args:
        sentence: List of token dictionaries, each with a ``'text'`` key.

    Returns:
        A new list of token dictionaries. The first token of a punctuation run
        keeps its other fields and its ``'text'`` becomes the concatenation of
        the whole run. The input list and its dictionaries are left untouched.
    """
    punctuation = set(',()"[];.?!:-')

    def is_punctuation(text):
        return all(character in punctuation for character in text)

    squashed = []
    for token in sentence:
        text = token['text']
        if is_punctuation(text) and squashed and is_punctuation(squashed[-1]['text']):
            # squashed[-1] is our own copy, so extending it cannot alter the
            # caller's token dictionaries.
            squashed[-1]['text'] += text
        else:
            # Shallow copy: later merges must not write through to the input.
            squashed.append(dict(token))

    return squashed

In [5]:
def word2label(sentence, i):
    """Return the punctuation label for the token at position ``i``.

    Args:
        sentence: List of token dictionaries, each with a ``'text'`` key.
        i: Index of the token to label.

    Returns:
        ``None`` when the token itself consists only of punctuation characters,
        the text of the next token when that token is punctuation, and ``''``
        otherwise (including for the last token of the sentence).
    """
    marks = ',()"[];.?!:-'

    def only_marks(text):
        return all(character in marks for character in text)

    if only_marks(sentence[i]['text']):
        return None

    following = i + 1
    if following < len(sentence) and only_marks(sentence[following]['text']):
        return sentence[following]['text']

    return ''

In [6]:
def sent2labels(sentence):
    """Collect the labels of all non-punctuation tokens of a sentence.

    Args:
        sentence: List of token dictionaries.

    Returns:
        List of label strings in token order; positions for which
        ``word2label`` yields ``None`` (punctuation tokens) are left out.
    """
    collected = []
    for position in range(len(sentence)):
        label = word2label(sentence, position)
        if label is not None:
            collected.append(label)
    return collected

In [7]:
def remove_punctuation(sentence):
    """Rebuild a sentence string from its tokens, leaving punctuation out.

    Args:
        sentence: List of token dictionaries, each with a ``'text'`` key.

    Returns:
        The texts of all tokens that are not made up solely of the characters
        ``,()"[];.?!:-``, each followed by a single space (so a non-empty
        result ends with a space).
    """
    marks = ',()"[];.?!:-'
    kept_words = [
        token['text']
        for token in sentence
        if not all(character in marks for character in token['text'])
    ]
    return ''.join(word + ' ' for word in kept_words)

In [8]:
def word2features(sentence, i):
    """Build the CRF feature dictionary for the token at position ``i``.

    Args:
        sentence: List of token dictionaries providing ``'text'``, ``'upos'``
            and ``'xpos'``.
        i: Index of the token to describe.

    Returns:
        A dictionary with the token's text and POS tags, the sentence length,
        the token position and the first word of the sentence, plus
        ``'prev_word'`` (or ``'BOS'`` for the first token) and ``'next_word'``
        (or ``'EOS'`` for the last token).
    """
    token = sentence[i]
    last_position = len(sentence) - 1

    features = {
        'word': token['text'],
        'sent_len': len(sentence),
        'pos_in_sent': i,
        'upos': token['upos'],
        'xpos': token['xpos'],
        'first_word_in_sent': sentence[0]['text'],
    }

    # Left context: the previous word, or a beginning-of-sentence marker.
    if i > 0:
        features['prev_word'] = sentence[i - 1]['text']
    else:
        features['BOS'] = True

    # Right context: the next word, or an end-of-sentence marker.
    if i < last_position:
        features['next_word'] = sentence[i + 1]['text']
    else:
        features['EOS'] = True

    return features

In [9]:
def sent2features(sentence):
    """Build the feature dictionaries for every token of a sentence.

    Args:
        sentence: List of token dictionaries.

    Returns:
        List with the ``word2features`` result for each position, in order.
    """
    feature_rows = []
    for position in range(len(sentence)):
        feature_rows.append(word2features(sentence, position))
    return feature_rows

In [10]:
def save_as_json(data, output_file_name):
    """Serialize ``data`` as JSON into ``output_file_name``.

    Args:
        data: JSON-serializable object.
        output_file_name: Path of the file to create or overwrite.

    Returns:
        None.
    """
    with open(output_file_name, mode="w") as json_handle:
        json.dump(obj=data, fp=json_handle)

In [11]:
def _with_name_suffix(file_name, suffix):
    """Insert ``suffix`` in front of the extension of ``file_name``."""
    root, extension = os.path.splitext(file_name)
    return root + suffix + extension


def data_prep(input_file_name, json_serialize=False):
    """Turn a one-sentence-per-line text file into CRF features and labels.

    Steps: tokenize every line, squash consecutive punctuation tokens, derive
    one punctuation label per word, strip the punctuation, re-run the stripped
    text through the tokenize+POS pipeline and build per-word features.

    Args:
        input_file_name: Path of the text file with one sentence per line.
        json_serialize: When true, X and y are also saved as JSON next to the
            input file, in ``<name>_X<ext>`` and ``<name>_y<ext>`` (for
            ``corpus.txt``: ``corpus_X.txt`` and ``corpus_y.txt``).

    Returns:
        Tuple ``(X, y)``: per-sentence lists of feature dictionaries and
        per-sentence lists of label strings.
    """
    data = read_file_as_list_of_sentences(input_file_name)

    # First pass: tokenization only, used to read the labels off the
    # punctuation that follows each word.
    nlp_tokenize = classla.Pipeline('bg', processors='tokenize')
    tokenized_data = run_through_classla_pipeline(data, nlp_tokenize)
    squashed_tokenized_data = [squash_punctuation(sentence) for sentence in tokenized_data]
    y = [sent2labels(sentence) for sentence in squashed_tokenized_data]

    # Second pass: POS-tag the punctuation-free text to build the features.
    data_without_punctuation = [remove_punctuation(sentence) for sentence in squashed_tokenized_data]
    nlp_pos_tokenize = classla.Pipeline('bg', processors='tokenize,pos')
    pos_tokenized_data = run_through_classla_pipeline(data_without_punctuation, nlp_pos_tokenize)
    X = [sent2features(sentence) for sentence in pos_tokenized_data]

    # Every stage maps sentences 1:1, so the number of sentences cannot drift.
    # What can differ is the number of tokens per sentence, because the
    # stripped text is tokenized a second time.
    misaligned = sum(
        1 for features, sentence_labels in zip(X, y)
        if len(features) != len(sentence_labels)
    )
    if misaligned:
        print("Warning: {} of {} sentences have a different number of features and labels".format(
            misaligned, len(X)))

    if json_serialize:
        # Derive the names from the extension so that an input without '.txt'
        # can never resolve to the input file itself and be overwritten.
        save_as_json(X, _with_name_suffix(input_file_name, '_X'))
        save_as_json(y, _with_name_suffix(input_file_name, '_y'))

    return X, y

In [12]:
def verify_prepped_data(X, y):
    """Print every feature/label pair whose two sequences differ in length.

    Args:
        X: Per-sentence lists of feature dictionaries.
        y: Per-sentence lists of labels.

    Returns:
        None. Mismatching pairs are printed; nothing is printed when all pairs
        line up. Pairs are formed with ``zip``, so surplus entries of the
        longer argument are not examined.
    """
    for sentence_features, sentence_labels in zip(X, y):
        if len(sentence_features) == len(sentence_labels):
            continue
        print(sentence_features, sentence_labels)

### Data Prep

In [13]:
%%time
# Build the CRF features (X) and punctuation labels (y) for the training split.
X_train, y_train = data_prep('../data/Bible/processed/Bibliia_clean_train.txt')

2021-09-25 21:11:17 INFO: Loading these models for language: bg (Bulgarian):
| Processor | Package  |
------------------------
| tokenize  | standard |

2021-09-25 21:11:17 INFO: Use device: gpu
2021-09-25 21:11:17 INFO: Loading: tokenize
2021-09-25 21:11:17 INFO: Done loading processors!
2021-09-25 21:11:20 INFO: Loading these models for language: bg (Bulgarian):
| Processor | Package  |
------------------------
| tokenize  | standard |
| pos       | standard |

2021-09-25 21:11:20 INFO: Use device: gpu
2021-09-25 21:11:20 INFO: Loading: tokenize
2021-09-25 21:11:20 INFO: Loading: pos
2021-09-25 21:11:25 INFO: Done loading processors!


CPU times: user 5min 40s, sys: 1.82 s, total: 5min 41s
Wall time: 5min 41s


In [15]:
# Sanity check: prints every training sentence whose feature and label
# sequences differ in length; no output means all sentences are aligned.
verify_prepped_data(X_train, y_train)

In [16]:
%%time
# Build the features and labels for the development split the same way.
X_dev, y_dev = data_prep('../data/Bible/processed/Bibliia_clean_dev.txt')

2021-09-25 21:21:48 INFO: Loading these models for language: bg (Bulgarian):
| Processor | Package  |
------------------------
| tokenize  | standard |

2021-09-25 21:21:48 INFO: Use device: gpu
2021-09-25 21:21:48 INFO: Loading: tokenize
2021-09-25 21:21:48 INFO: Done loading processors!
2021-09-25 21:21:49 INFO: Loading these models for language: bg (Bulgarian):
| Processor | Package  |
------------------------
| tokenize  | standard |
| pos       | standard |

2021-09-25 21:21:49 INFO: Use device: gpu
2021-09-25 21:21:49 INFO: Loading: tokenize
2021-09-25 21:21:49 INFO: Loading: pos
2021-09-25 21:21:50 INFO: Done loading processors!


CPU times: user 3min, sys: 404 ms, total: 3min
Wall time: 3min


### CRF Model

In [17]:
%%time
# CRF trained with L-BFGS for at most 100 iterations. c1 and c2 are the
# regularisation coefficients handed to the trainer (L1 and L2 respectively
# per the sklearn-crfsuite documentation - confirm for the installed version).
crf = sklearn_crfsuite.CRF(
    algorithm='lbfgs',
    c1=0.1,
    c2=0.1,
    max_iterations=100,
    all_possible_transitions=True
)
# Fit on the training features and punctuation labels produced by data_prep.
crf.fit(X_train, y_train)

CPU times: user 2min 15s, sys: 95.9 ms, total: 2min 15s
Wall time: 2min 15s


In [18]:
# All label values the fitted model knows, including '' (no punctuation).
labels = list(crf.classes_)

In [19]:
# Display the full label set learned from the training data.
labels

['',
 ':',
 '!',
 ',',
 '?',
 ';',
 '.',
 '-',
 ',(',
 '"',
 '").',
 '(',
 ')',
 '?-',
 '.-',
 ':)',
 '!-',
 ':"',
 '."',
 ':(',
 '?)',
 '"?',
 '".',
 '";',
 '"-',
 ').',
 '!,',
 '-"',
 ')!',
 '",',
 '.")',
 '!;',
 ';(',
 '),',
 '?;',
 '?"',
 ');',
 '.)',
 '?,',
 ';"',
 '):']

In [20]:
# Drop the empty "no punctuation" label so the metrics below cover punctuation
# classes only. Rebinding a filtered copy (instead of list.remove) keeps this
# cell idempotent: re-running it no longer raises ValueError.
labels = [label for label in labels if label != '']

In [21]:
# Display the remaining labels (punctuation classes only).
labels

[':',
 '!',
 ',',
 '?',
 ';',
 '.',
 '-',
 ',(',
 '"',
 '").',
 '(',
 ')',
 '?-',
 '.-',
 ':)',
 '!-',
 ':"',
 '."',
 ':(',
 '?)',
 '"?',
 '".',
 '";',
 '"-',
 ').',
 '!,',
 '-"',
 ')!',
 '",',
 '.")',
 '!;',
 ';(',
 '),',
 '?;',
 '?"',
 ');',
 '.)',
 '?,',
 ';"',
 '):']

In [22]:
# Predict a label sequence for every sentence of the development split.
y_pred = crf.predict(X_dev)

In [23]:
# Weighted F1 restricted to the entries of `labels`; the empty
# "no punctuation" label was removed from that list earlier in the notebook.
metrics.flat_f1_score(y_dev, y_pred,
                      average='weighted', labels=labels)

  average, "true nor predicted", 'F-score is', len(true_sum)


0.6386622718017901

In [24]:
# show all metrics
# Labels are ordered by everything after their first character, then by the
# first character itself. NOTE: name[0] raises IndexError for an empty string,
# so this relies on '' having been removed from `labels` beforehand.
sorted_labels = sorted(
    labels,
    key=lambda name: (name[1:], name[0])
)
print(metrics.flat_classification_report(
    y_dev, y_pred, labels=sorted_labels, digits=3
))

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


              precision    recall  f1-score   support

           !      0.533     0.189     0.279       297
           "      0.000     0.000     0.000         3
           (      0.000     0.000     0.000         8
           )      0.000     0.000     0.000         4
           ,      0.738     0.546     0.628      6602
           -      0.381     0.054     0.094       298
           .      0.842     0.976     0.904      2675
           :      0.904     0.710     0.795       799
           ;      0.171     0.021     0.038       562
           ?      0.717     0.229     0.347       310
          )!      0.000     0.000     0.000         1
          -"      0.000     0.000     0.000         1
          ."      0.000     0.000     0.000        18
          :"      0.500     0.077     0.133        26
          ;"      0.000     0.000     0.000         0
          ?"      0.000     0.000     0.000         0
         .")      0.000     0.000     0.000         0
          ,(      0.000    

### Randomized Hyperparameter Search Attempt - Worse Results

In [None]:
# from sklearn.metrics import make_scorer
# import scipy.stats
# from sklearn.model_selection import RandomizedSearchCV

In [None]:
# %%time
# # define fixed parameters and parameters to search
# crf = sklearn_crfsuite.CRF(
#     algorithm='lbfgs',
#     max_iterations=100,
#     all_possible_transitions=True
# )
# params_space = {
#     'c1': scipy.stats.expon(scale=0.5),
#     'c2': scipy.stats.expon(scale=0.05),
# }

# # use the same metric for evaluation
# f1_scorer = make_scorer(metrics.flat_f1_score,
#                         average='weighted', labels=labels)

# # search
# rs = RandomizedSearchCV(crf, params_space,
#                         cv=3,
#                         verbose=1,
#                         n_jobs=-1,
#                         n_iter=50,
#                         scoring=f1_scorer)
# rs.fit(X, y)

In [None]:
# crf = rs.best_estimator_
# print('best params:', rs.best_params_)
# print('best CV score:', rs.best_score_)
# print('model size: {:0.2f}M'.format(rs.best_estimator_.size_ / 1000000))

In [None]:
# crf = rs.best_estimator_
# y_pred = crf.predict(X_test)

# sorted_labels = sorted(
#     labels,
#     key=lambda name: (name[1:], name[0])
# )

# print(metrics.flat_classification_report(
#     y_test, y_pred, labels=sorted_labels, digits=3
# ))