In [None]:
import json
import sklearn_crfsuite
from sklearn_crfsuite import metrics
import classla
import json

In [None]:
def read_file_as_list_of_sentences(input_file_name, encoding="utf-8"):
    """Read a text file that holds one sentence per line.

    Args:
        input_file_name: path of the text file to read.
        encoding: text encoding used to decode the file. Defaults to UTF-8 so
            the result does not depend on the platform's locale encoding
            (the JSON files in this notebook are opened as UTF-8 as well).

    Returns:
        list of str: the lines of the file without their line terminators.
    """
    with open(input_file_name, "r", encoding=encoding) as input_file:
        return input_file.read().splitlines()

In [None]:
# Bulgarian ('bg') classla pipeline running the tokenize and pos processors.
nlp_pos_tokenize = classla.Pipeline('bg', processors='tokenize,pos')

In [None]:
# Bulgarian ('bg') classla pipeline running only the tokenize processor.
nlp_tokenize = classla.Pipeline('bg', processors='tokenize')

In [None]:
def run_through_classla_pipeline(list_of_sentences, pipeline):
    """Run every sentence through a classla pipeline and collect the results.

    Args:
        list_of_sentences: list of sentences (strings).
        pipeline: callable applied to each string; its return value must
            provide ``to_dict()`` (the tokenize or tokenize+pos pipeline).

    Returns:
        list: for each input string, the element at ``[0][0]`` of the
        ``to_dict()`` result (a list of token dictionaries).
    """
    processed = []
    for text in list_of_sentences:
        document = pipeline(text)
        # NOTE(review): only the [0][0] entry is kept - presumably the token
        # list of the first sentence classla detects, so the rest of a line
        # that gets split into several sentences would be dropped - confirm.
        processed.append(document.to_dict()[0][0])
    return processed

In [None]:
def squash_punctuation(sentence):
    """Merge runs of consecutive punctuation tokens into a single token.

    A token counts as punctuation when every character of its 'text' is one
    of ``,()"[];.?!:-`` (the same per-character test word2label uses), so
    multi-character tokens such as '...' are merged too.

    Args:
        sentence: list of token dictionaries, each with a 'text' key.

    Returns:
        list of token dictionaries in which each run of adjacent punctuation
        tokens is replaced by one token whose 'text' is the concatenation of
        the run. Non-punctuation tokens are passed through unchanged.
    """
    punctuation = ',()"[];.?!:-'
    new_sentence = []

    for token in sentence:
        if all(character in punctuation for character in token['text']):
            previous = new_sentence[-1] if new_sentence else None
            if previous is not None and all(character in punctuation for character in previous['text']):
                # `previous` is always a copy made below, so the caller's
                # dictionaries are never modified by the merge.
                previous['text'] += token['text']
            else:
                # Copy the first token of a run: its 'text' may be extended.
                new_sentence.append(dict(token))
        else:
            new_sentence.append(token)

    return new_sentence

In [None]:
def word2label(sentence, i):
    """Label the token at index ``i`` by the punctuation that follows it.

    Args:
        sentence: list of token dictionaries, each with a 'text' key.
        i: index of the token to label.

    Returns:
        None when the token itself consists only of punctuation characters;
        the text of the next token when that token consists only of
        punctuation characters; '' otherwise.
    """
    punctuation_marks = ',()"[];.?!:-'

    def only_punctuation(text):
        return all(symbol in punctuation_marks for symbol in text)

    if only_punctuation(sentence[i]['text']):
        return None

    has_successor = i + 1 < len(sentence)
    if has_successor and only_punctuation(sentence[i + 1]['text']):
        return sentence[i + 1]['text']

    return ''

In [None]:
def sent2labels(sentence):
    """Collect the labels of all non-punctuation tokens of a sentence.

    Args:
        sentence: list of token dictionaries.

    Returns:
        list of str: the word2label result for every index of the sentence,
        in order, with the None results (punctuation tokens) left out.
    """
    labels = []
    for position in range(len(sentence)):
        label = word2label(sentence, position)
        if label is not None:
            labels.append(label)
    return labels

In [None]:
def remove_punctuation(sentence):
    """Rebuild the sentence text without its punctuation tokens.

    A token is dropped only when every character of its 'text' is one of
    ``,()"[];.?!:-``. This is the same test word2label uses to decide that a
    token is punctuation, so every token that receives a label is kept here
    (for example a hyphenated word such as 'по-добър').

    Args:
        sentence: list of token dictionaries, each with a 'text' key.

    Returns:
        str: the texts of the kept tokens, each followed by a single space
        (so a non-empty result ends with a space).
    """
    punctuation = ',()"[];.?!:-'
    return ''.join(
        token['text'] + ' '
        for token in sentence
        if not all(character in punctuation for character in token['text'])
    )

In [None]:
def word2features(sentence, i):
    """Build the feature dictionary for the token at index ``i``.

    Args:
        sentence: list of token dictionaries with 'text', 'upos' and 'xpos'.
        i: index of the token to describe.

    Returns:
        dict with the token's text and tags, the sentence length, the token
        position and the first word of the sentence, plus 'prev_word' (or
        'BOS' for the first token) and 'next_word' (or 'EOS' for the last).
    """
    token = sentence[i]
    last_index = len(sentence) - 1

    features = {}
    features['word'] = token['text']
    features['sent_len'] = len(sentence)
    features['pos_in_sent'] = i
    features['upos'] = token['upos']
    features['xpos'] = token['xpos']
    features['first_word_in_sent'] = sentence[0]['text']

    # Left context: the previous word, or a beginning-of-sentence marker.
    if i > 0:
        features['prev_word'] = sentence[i - 1]['text']
    else:
        features['BOS'] = True

    # Right context: the next word, or an end-of-sentence marker.
    if i < last_index:
        features['next_word'] = sentence[i + 1]['text']
    else:
        features['EOS'] = True

    return features

In [None]:
def sent2features(sentence):
    """Build the feature dictionaries for all tokens of a sentence.

    Args:
        sentence: list of token dictionaries.

    Returns:
        list of dict: the word2features result for every index, in order.
    """
    feature_dicts = []
    for position in range(len(sentence)):
        feature_dicts.append(word2features(sentence, position))
    return feature_dicts

In [None]:
def save_pos_tokenized_sentences_as_json(pos_tokenized_sentences, output_file_name):
    """Serialize the (POS) tokenized sentences to a JSON file.

    Args:
        pos_tokenized_sentences: list of lists of token dictionaries.
        output_file_name: path of the JSON file to write (overwritten).

    Returns:
        None.
    """
    with open(output_file_name, mode="w") as json_file:
        json.dump(obj=pos_tokenized_sentences, fp=json_file)

In [None]:
# Example sentence for trying out the helpers; it contains commas, colons, periods, dashes and an exclamation mark.
sentence = 'И ще изкараш ли виновен Праведния и Могъщия, който казва на цар: Негоден си. - и на княз: Беззаконник. - който пристрастие към първенци не показва, нито зачита богатия повече от бедния, понеже всички те са дело на ръцете Му!'

In [None]:
# Tokenize the example sentence; a one-element list goes in, so a one-element list comes back.
tokenized_sentence = run_through_classla_pipeline([sentence], nlp_tokenize)

In [None]:
tokenized_sentence

In [None]:
# Merge adjacent punctuation tokens of the (single) tokenized example sentence.
squashed_sent = squash_punctuation(tokenized_sentence[0])

In [None]:
squashed_sent

In [None]:
sent2labels(squashed_sent)

In [None]:
# Rebuild the example sentence as plain text without its punctuation tokens.
new_sentence = remove_punctuation(squashed_sent)

In [None]:
new_sentence

In [None]:
run_through_classla_pipeline([new_sentence], nlp_pos_tokenize)

In [None]:
# Load the sentences (one per line) that the cells below treat as training data.
# NOTE(review): the variable is called train_data but the file name ends in "_dev" - confirm this is the intended split.
train_data =  read_file_as_list_of_sentences('../data/Bible/processed/Bibliia_clean_dev.txt')

In [None]:
len(train_data)

In [None]:
# NOTE(review): rebuilds the tokenize-only pipeline with the same arguments as the earlier cell; the existing `nlp_tokenize` could be reused instead.
nlp_tokenize = classla.Pipeline('bg', processors='tokenize')

In [None]:
# Tokenize every training sentence (punctuation still included).
tokenized_train_data = run_through_classla_pipeline(train_data, nlp_tokenize)

In [None]:
# Merge adjacent punctuation tokens in every tokenized training sentence.
squashed_tokenized_train_data = list(map(squash_punctuation, tokenized_train_data))

In [None]:
# One label sequence per training sentence (punctuation tokens carry no label).
y_train = list(map(sent2labels, squashed_tokenized_train_data))

In [None]:
# Plain-text version of every training sentence with the punctuation tokens removed.
train_data_without_punctuation = list(map(remove_punctuation, squashed_tokenized_train_data))

In [None]:
# NOTE(review): rebuilds the tokenize+pos pipeline with the same arguments as the earlier cell; the existing `nlp_pos_tokenize` could be reused instead.
nlp_pos_tokenize = classla.Pipeline('bg', processors='tokenize,pos')

In [None]:
# Tokenize and POS-tag the punctuation-free training sentences.
pos_tokenized_train_data = run_through_classla_pipeline(train_data_without_punctuation, nlp_pos_tokenize)

In [None]:
# One sequence of feature dictionaries per POS-tagged training sentence.
X_train = list(map(sent2features, pos_tokenized_train_data))

In [None]:
# Sanity check: each sentence needs exactly one label per feature dictionary.
# Pair every feature sequence with the label sequence at the same position
# (zip) rather than comparing every sentence against every label sequence,
# which would report unrelated pairs as mismatches.
for sentence, label in zip(X_train, y_train):
    if len(sentence) != len(label):
        print(sentence, label)

In [None]:
# Open the test-set JSON file as UTF-8 text; the handle is read by json.load in a later cell.
f2 = open('../data/Bible/processed/Bibliia_clean_test.json', "r", encoding="utf-8")

In [None]:
len([{'word': 'А', 'sent_len': 9, 'pos_in_sent': 0, 'upos': 'CCONJ', 'xpos': 'Cp', 'first_word_in_sent': 'А', 'BOS': True, 'next_word': 'Йоас'}, {'word': 'Йоас', 'sent_len': 9, 'pos_in_sent': 1, 'upos': 'PROPN', 'xpos': 'Hmsi', 'first_word_in_sent': 'А', 'prev_word': 'А', 'next_word': 'беше'}, {'word': 'беше', 'sent_len': 9, 'pos_in_sent': 2, 'upos': 'AUX', 'xpos': 'Vxitf-t3s', 'first_word_in_sent': 'А', 'prev_word': 'Йоас', 'next_word': 'погребан'}, {'word': 'погребан', 'sent_len': 9, 'pos_in_sent': 3, 'upos': 'VERB', 'xpos': 'Vpptcv--smi', 'first_word_in_sent': 'А', 'prev_word': 'беше', 'next_word': 'в'}, {'word': 'в', 'sent_len': 9, 'pos_in_sent': 4, 'upos': 'ADP', 'xpos': 'R', 'first_word_in_sent': 'А', 'prev_word': 'погребан', 'next_word': 'Самария'}, {'word': 'Самария', 'sent_len': 9, 'pos_in_sent': 5, 'upos': 'PROPN', 'xpos': 'Npfsi', 'first_word_in_sent': 'А', 'prev_word': 'в', 'next_word': 'при'}, {'word': 'при', 'sent_len': 9, 'pos_in_sent': 6, 'upos': 'ADP', 'xpos': 'R', 'first_word_in_sent': 'А', 'prev_word': 'Самария', 'next_word': 'израилевите'}, {'word': 'израилевите', 'sent_len': 9, 'pos_in_sent': 7, 'upos': 'ADJ', 'xpos': 'A-pd', 'first_word_in_sent': 'А', 'prev_word': 'при', 'next_word': 'царе'}, {'word': 'царе', 'sent_len': 9, 'pos_in_sent': 8, 'upos': 'NOUN', 'xpos': 'Ncmpi', 'first_word_in_sent': 'А', 'prev_word': 'израилевите', 'EOS': True}])

In [None]:
 len(['', '', '', '', ':', '', '', '', '', '', '', '', '', '-', '', '', '', '', '', '', '', '', '.'])

In [None]:
# NOTE(review): `f` is not assigned in any visible cell of this notebook (only `f2` and `f3` are opened),
# so this line raises NameError on a fresh kernel unless `f` is defined elsewhere - confirm which file should be loaded here.
data = json.load(f)

In [None]:
# Parse the test-set JSON and always release the file handle afterwards,
# even when parsing fails (the handle is opened in an earlier cell).
try:
    test_data = json.load(f2)
finally:
    f2.close()

In [None]:
# Open the one-sentence JSON file as UTF-8 text; the handle is read by json.load in the next cell.
f3 = open('../data/Bible/processed/Bibliia_clean_one_sent.json', "r", encoding="utf-8")

In [None]:
# Parse the one-sentence JSON and always release the file handle afterwards,
# even when parsing fails (the handle is opened in the previous cell).
try:
    f3_data = json.load(f3)
finally:
    f3.close()

In [None]:
%%time
# Build feature and label sequences from the JSON-loaded sentences.
# X_train / y_train assigned here replace the values computed in earlier cells.
X_train = list(map(sent2features, data))
y_train = list(map(sent2labels, data))

X_test = list(map(sent2features, test_data))
y_test = list(map(sent2labels, test_data))

In [None]:
%%time
# Configure a CRF model and train it on the training feature/label sequences.
crf = sklearn_crfsuite.CRF(
    algorithm='lbfgs',  # training algorithm
    c1=0.1,  # L1 regularization coefficient
    c2=0.1,  # L2 regularization coefficient
    max_iterations=100,  # upper bound on optimizer iterations
    all_possible_transitions=True  # also generate transition features not seen in the training data
)
crf.fit(X_train, y_train)

In [None]:
# Copy the label set learned by the model into a plain list.
labels = list(crf.classes_)

In [None]:
labels

In [None]:
# Drop the empty "no punctuation" label so the F1 score computed below is
# restricted to actual punctuation labels. Filtering (instead of
# list.remove) does not raise when '' is absent, so the cell can be re-run.
labels = [label for label in labels if label != '']

In [None]:
labels

In [None]:
# Predict a label sequence for every test sentence.
y_pred = crf.predict(X_test)

In [None]:
# Weighted-average F1 of predictions against the test labels, restricted to the entries of `labels`.
metrics.flat_f1_score(y_test, y_pred,
                      average='weighted', labels=labels)