In [1]:
import json
import sklearn_crfsuite
from sklearn_crfsuite import metrics
import classla
import json

In [2]:
# input - name of a text file with one sentence per line
# output - list of sentences (strings)
def read_file_as_list_of_sentences(input_file_name):
    with open(input_file_name, "r") as input_file:
        return input_file.read().splitlines()

In [5]:
# input - list of sentences (strings) and a classla pipeline (POS tokenize or tokenize)
# output - list of dictionaries ((POS) tokenized sentences) 
def run_through_classla_pipeline(list_of_sentences, pipeline):
    return [pipeline(sentence).to_dict()[0][0] for sentence in list_of_sentences]

In [6]:
# input - list of dictionaries (a dictionary for each word - the tokenized version of the sentence)
# output - list of dictionaries (dictionaries that contain punctuation one after another are squashed into one)
def squash_punctuation(sentence):
    new_sentence = []

    for i in range(len(sentence)):
        if sentence[i]['text'] in ',()"[];.?!:-':
            if len(new_sentence) > 0 and all(character in ',()"[];.?!:-' for character in new_sentence[-1]['text']):
                new_sentence[-1]['text'] += sentence[i]['text']
            else:
                new_sentence.append(sentence[i])
        else:
            new_sentence.append(sentence[i])
            
    return new_sentence

In [7]:
# input - sentence and an index of a word (dictionary) - assigns a label to each word based on the punctuation after
# output - a label - None for punctuation, punctuation for words followed by punctuation and empty for words if they are not
def word2label(sentence, i):
    if all(character in ',()"[];.?!:-' for character in sentence[i]['text']):
        return None

    if i < len(sentence) - 1:
        if all(character in ',()"[];.?!:-' for character in sentence[i+1]['text']):
            label = sentence[i+1]['text']
            return label
    
    return ''

In [8]:
# input - sentence (list of dictionaries)
# output - list of labels (strings) with None labels for punctuation being filtered out
def sent2labels(sentence):
    return [label for label in (word2label(sentence, i) for i in range(len(sentence))) if label != None]

In [46]:
# input - list of dictionaries (a dictionary for each word - the tokenized version of the sentence)
# output - new sentence (string) with the punctuation removed
def remove_punctuation(sentence):
    new_sentence = ''

    for i in range(len(sentence)):
        if all(character in ',()"[];.?!:-' for character in sentence[i]['text']):
            pass
        else:
            new_sentence = new_sentence + sentence[i]['text'] + ' '
    
    return new_sentence

In [10]:
# input - sentence (list of dictionaries) and an index of a word (dictionary)
# output - list of dictionaries (features, for each word)
def word2features(sentence, i):
    features = {
        'word': sentence[i]['text'],
        'sent_len': len(sentence),
        'pos_in_sent': i,
        'upos': sentence[i]['upos'],
        'xpos': sentence[i]['xpos'],
        'first_word_in_sent': sentence[0]['text']
    }

    if i > 0:
        features.update({
            'prev_word': sentence[i-1]['text']
        })
    else:
        features.update({
            'BOS': True
        })

    if i < len(sentence)-1:
        features.update({
            'next_word': sentence[i+1]['text']
        })
    else:
        features.update({
            'EOS': True
        })

    return features

In [11]:
# input - sentence (list of dictionaries)
# output - list of features (dictionaries, for each word)
def sent2features(sentence):
    return [word2features(sentence, i) for i in range(len(sentence))]

In [12]:
# input - a list of lists of dictionaries ((POS) tokenized sentences) and name of the output JSON file
# output - None, saves the sentences to a JSON file
def save_pos_tokenized_sentences_as_json(pos_tokenized_sentences, output_file_name):
    with open(output_file_name, "w") as output_file:
        json.dump(pos_tokenized_sentences, output_file) 

In [68]:
# input - name of a text file with one sentence per line
# output - X and y - features and labels
def data_prep(input_file_name):
    data = read_file_as_list_of_sentences(input_file_name)
    nlp_tokenize = classla.Pipeline('bg', processors='tokenize')
    tokenized_data = run_through_classla_pipeline(data, nlp_tokenize)
    
    if len(data) != len(tokenized_data):
        print("Warning: Mismatch in the count of the data and tokenized data")

    squashed_tokenized_data = [squash_punctuation(sentence) for sentence in tokenized_data]

    if len(tokenized_data) != len(squashed_tokenized_data):
        print("Warning: Mismatch in the count of the tokenized and squashed tokenized data")
    
    y = [sent2labels(sentence) for sentence in squashed_tokenized_data]
    
    if len(squashed_tokenized_data) != len(y):
        print("Warning: Mismatch in the count of the squashed tokenized data and labeled data")
    
    data_without_punctuation = [remove_punctuation(sentence) for sentence in squashed_tokenized_data]
    
    if len(data_without_punctuation) != len(y):
        print("Warning: Mismatch in the count of the data without punctuation and labeled data")
    
    nlp_pos_tokenize = classla.Pipeline('bg', processors='tokenize,pos')   
    pos_tokenized_data = run_through_classla_pipeline(data_without_punctuation, nlp_pos_tokenize)
    
    if len(data_without_punctuation) != len(pos_tokenized_data):
        print("Warning: Mismatch in the count of the data without punctuation and POS tokenized data")
    
    X = [sent2features(sentence) for sentence in pos_tokenized_data]
    
    if len(X) != len(pos_tokenized_data):
        print("Warning: Mismatch in the count of the prepped data and POS tokenized data")
        
    return X, y

In [67]:
# input - X and y - features and labels
# output - None, prints on the screen the features-labels pairs that have length mismatch
def verify_prepped_data(X, y):
    for feat, label in zip(X, y):
        if len(feat) != len(label):
            print(feat, label)

In [69]:
%%time
X, y = data_prep('../data/Bible/processed/Bibliia_clean_dev.txt')

2021-09-22 20:04:25 INFO: Loading these models for language: bg (Bulgarian):
| Processor | Package  |
------------------------
| tokenize  | standard |

2021-09-22 20:04:25 INFO: Use device: gpu
2021-09-22 20:04:25 INFO: Loading: tokenize
2021-09-22 20:04:25 INFO: Done loading processors!
2021-09-22 20:04:26 INFO: Loading these models for language: bg (Bulgarian):
| Processor | Package  |
------------------------
| tokenize  | standard |
| pos       | standard |

2021-09-22 20:04:26 INFO: Use device: gpu
2021-09-22 20:04:26 INFO: Loading: tokenize
2021-09-22 20:04:26 INFO: Loading: pos
2021-09-22 20:04:27 INFO: Done loading processors!


CPU times: user 2min 53s, sys: 442 ms, total: 2min 53s
Wall time: 2min 53s


In [70]:
verify_prepped_data(X, y)

In [71]:
%%time
X_test, y_test = data_prep('../data/Bible/processed/Bibliia_clean_test.txt')

2021-09-22 20:50:13 INFO: Loading these models for language: bg (Bulgarian):
| Processor | Package  |
------------------------
| tokenize  | standard |

2021-09-22 20:50:13 INFO: Use device: gpu
2021-09-22 20:50:13 INFO: Loading: tokenize
2021-09-22 20:50:13 INFO: Done loading processors!
2021-09-22 20:50:15 INFO: Loading these models for language: bg (Bulgarian):
| Processor | Package  |
------------------------
| tokenize  | standard |
| pos       | standard |

2021-09-22 20:50:15 INFO: Use device: gpu
2021-09-22 20:50:15 INFO: Loading: tokenize
2021-09-22 20:50:15 INFO: Loading: pos
2021-09-22 20:50:16 INFO: Done loading processors!


CPU times: user 2min 17s, sys: 314 ms, total: 2min 17s
Wall time: 2min 17s


In [72]:
%%time
crf = sklearn_crfsuite.CRF(
    algorithm='lbfgs',
    c1=0.1,
    c2=0.1,
    max_iterations=100,
    all_possible_transitions=True
)
crf.fit(X, y)

CPU times: user 21.9 s, sys: 14.2 ms, total: 21.9 s
Wall time: 21.9 s


In [73]:
labels = list(crf.classes_)

In [74]:
labels

['',
 ',',
 ';',
 '.',
 '-',
 '!',
 '?',
 ':',
 ':"',
 '"?',
 '!-',
 '."',
 '(',
 ')',
 '";',
 '),',
 '?-',
 '.-',
 ').',
 ',(',
 ':(',
 '!"',
 ',)',
 '"',
 '",',
 '!;',
 '?"',
 '?)',
 '.)']

In [75]:
labels.remove('')

In [76]:
labels

[',',
 ';',
 '.',
 '-',
 '!',
 '?',
 ':',
 ':"',
 '"?',
 '!-',
 '."',
 '(',
 ')',
 '";',
 '),',
 '?-',
 '.-',
 ').',
 ',(',
 ':(',
 '!"',
 ',)',
 '"',
 '",',
 '!;',
 '?"',
 '?)',
 '.)']

In [77]:
y_pred = crf.predict(X_test)

In [78]:
metrics.flat_f1_score(y_test, y_pred,
                      average='weighted', labels=labels)

  average, "true nor predicted", 'F-score is', len(true_sum)


0.6191757607661913

In [79]:
# group B and I results
sorted_labels = sorted(
    labels,
    key=lambda name: (name[1:], name[0])
)
print(metrics.flat_classification_report(
    y_test, y_pred, labels=sorted_labels, digits=3
))

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


              precision    recall  f1-score   support

           !      0.478     0.110     0.178       292
           "      0.000     0.000     0.000         4
           (      0.000     0.000     0.000         7
           )      0.000     0.000     0.000         4
           ,      0.729     0.520     0.607      6558
           -      0.452     0.081     0.137       346
           .      0.843     0.983     0.908      2698
           :      0.927     0.652     0.766       784
           ;      0.150     0.018     0.032       665
           ?      0.730     0.249     0.372       293
          !"      0.000     0.000     0.000         1
          ."      0.000     0.000     0.000        14
          :"      0.500     0.087     0.148        23
          ?"      0.000     0.000     0.000         2
          ,(      0.000     0.000     0.000         3
          :(      0.000     0.000     0.000         0
          ,)      0.000     0.000     0.000         0
          .)      0.000    

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
