In [1]:
import json
import sklearn_crfsuite
from sklearn_crfsuite import metrics
import classla
import json
import re

In [2]:
# input - name of a text file with one sentence per line
# output - list of sentences (strings)
def read_file_as_list_of_sentences(input_file_name, encoding="utf-8"):
    """Read a text file and return its lines as a list of sentences.

    Args:
        input_file_name: Path of a text file holding one sentence per line.
        encoding: Text encoding used to decode the file. Defaults to UTF-8 so
            that the result does not depend on the platform's default
            encoding; pass ``None`` to fall back to the platform default.

    Returns:
        A list of strings, one per line, without line terminators.
    """
    with open(input_file_name, "r", encoding=encoding) as input_file:
        return input_file.read().splitlines()

In [3]:
# input - list of sentences (strings) and a classla pipeline (POS tokenize or tokenize)
# output - list of dictionaries ((POS) tokenized sentences) 
def run_through_classla_pipeline(list_of_sentences, pipeline):
    """Run every sentence through a pipeline and collect the tokenized results.

    Args:
        list_of_sentences: Iterable of sentence strings.
        pipeline: Callable applied to each sentence; its result must provide
            a ``to_dict()`` method.

    Returns:
        A list with one entry per input sentence: element ``[0][0]`` of the
        ``to_dict()`` output.
    """
    processed = []

    for raw_sentence in list_of_sentences:
        document = pipeline(raw_sentence)
        # NOTE(review): [0][0] presumably selects the token list of the first
        # sentence the pipeline detected; anything after it is dropped --
        # confirm against the classla to_dict() format.
        processed.append(document.to_dict()[0][0])

    return processed

In [4]:
# input - list of dictionaries (a dictionary for each word - the tokenized version of the sentence)
# output - list of dictionaries (dictionaries that contain punctuation one after another are squashed into one)
def squash_punctuation(sentence):
    """Merge runs of consecutive punctuation tokens into a single token.

    A token counts as punctuation when every character of its ``'text'`` is
    one of ``,()"[];.?!:-``. This is the same per-character test used by
    ``word2label`` and ``remove_punctuation``, so multi-character tokens such
    as ``...`` or ``!?`` are recognised too (a plain substring test against
    the punctuation string would miss them).

    Args:
        sentence: List of token dictionaries, each with a ``'text'`` key.

    Returns:
        A new list of token dictionaries in which adjacent punctuation tokens
        are concatenated into the first token of the run. Tokens are shallow
        copies, so the input dictionaries are left unmodified.
    """
    punctuation = ',()"[];.?!:-'

    def is_punctuation(text):
        return all(character in punctuation for character in text)

    new_sentence = []

    for token in sentence:
        follows_punctuation = bool(new_sentence) and is_punctuation(new_sentence[-1]['text'])

        if follows_punctuation and is_punctuation(token['text']):
            # Extend the previous (already copied) punctuation token.
            new_sentence[-1]['text'] += token['text']
        else:
            # Copy so that later merging never mutates the caller's token.
            new_sentence.append(dict(token))

    return new_sentence

In [5]:
# input - label (string)
# output - transformed label (string) - one that is part of the allowed labels
def transform_label(label):
    """Map an arbitrary punctuation string onto one of the allowed labels.

    Resolution order:
      1. the label itself, when it is already an allowed label;
      2. the first multi-character allowed label contained in it;
      3. the highest-priority single mark contained in it;
      4. the empty string when nothing matches.

    Args:
        label: Punctuation string to normalise.

    Returns:
        One of the allowed labels (possibly ``''``).
    """
    # Single marks, highest priority first.
    marks_by_priority = (',', '?', '!', '.', ':', '-', ';', '(', ')', '"',
                         '...', '[', ']')
    # Multi-character labels, checked in this order.
    compound_labels = ('...', '",', '),', '".', ').', ':"')
    allowed_labels = (',', '?', '!', '.', ':', '-', ';', '(', ')', '"',
                      '', '[', ']') + compound_labels

    if label in allowed_labels:
        return label

    for compound in compound_labels:
        if compound in label:
            return compound

    for mark in marks_by_priority:
        if mark in label:
            return mark

    return ''

In [6]:
# input - sentence and an index of a word (dictionary) - assigns a label to each word based on the punctuation after
# output - a label - None for punctuation, punctuation for words followed by punctuation and empty for words if they are not
def word2label(sentence, i):
    """Derive the punctuation label for the token at index ``i``.

    Args:
        sentence: List of token dictionaries, each with a ``'text'`` key.
        i: Index of the token to label.

    Returns:
        ``None`` when the token itself consists only of punctuation
        characters; the ``transform_label`` of the following token when that
        token consists only of punctuation characters; ``''`` otherwise.
    """
    punctuation = ',()"[];.?!:-'

    def only_punctuation(text):
        for character in text:
            if character not in punctuation:
                return False
        return True

    if only_punctuation(sentence[i]['text']):
        return None

    has_follower = i < len(sentence) - 1
    if has_follower and only_punctuation(sentence[i + 1]['text']):
        return transform_label(sentence[i + 1]['text'])

    return ''

In [7]:
# input - sentence (list of dictionaries)
# output - list of labels (strings) with None labels for punctuation being filtered out
def sent2labels(sentence):
    """Collect the labels of all non-punctuation tokens of a sentence.

    Args:
        sentence: List of token dictionaries.

    Returns:
        The ``word2label`` results in token order, with the ``None`` entries
        (produced for punctuation tokens) left out.
    """
    labels = []

    for position in range(len(sentence)):
        label = word2label(sentence, position)
        if label is not None:
            labels.append(label)

    return labels

In [8]:
# input - list of dictionaries (a dictionary for each word - the tokenized version of the sentence)
# output - new sentence (string) with the punctuation removed
def remove_punctuation(sentence):
    """Rebuild a sentence string from its tokens, leaving punctuation out.

    Args:
        sentence: List of token dictionaries, each with a ``'text'`` key.

    Returns:
        The texts of all tokens that contain at least one non-punctuation
        character, each followed by a single space (so a non-empty result
        ends with a space).
    """
    punctuation = ',()"[];.?!:-'

    kept_texts = [
        token['text']
        for token in sentence
        if any(character not in punctuation for character in token['text'])
    ]

    return ''.join(text + ' ' for text in kept_texts)

In [9]:
# input - sentence (list of dictionaries)
# output - True/False depending on whether the sentence contains interrogative word or not
def contains_interrogative_word(sentence):
    """Tell whether any token's ``'xpos'`` tag starts with ``'Pi'``.

    Args:
        sentence: List of token dictionaries, each with an ``'xpos'`` key.

    Returns:
        ``True`` if at least one tag starts with ``'Pi'``, else ``False``.
    """
    return any(token['xpos'].startswith('Pi') for token in sentence)

In [10]:
# input - sentence (list of dictionaries)
# output - True/False depending on whether the sentence contains interrogative particle or not
def contains_interrogative_particle(sentence):
    """Tell whether any token's ``'xpos'`` tag is exactly ``'Ti'``.

    Args:
        sentence: List of token dictionaries, each with an ``'xpos'`` key.

    Returns:
        ``True`` if at least one tag equals ``'Ti'``, else ``False``.
    """
    return any(token['xpos'] == 'Ti' for token in sentence)

In [11]:
# input - sentence (list of dictionaries)
# output - True/False depending on whether the sentence contains imperative verb or not
def contains_imperative_verb(sentence):
    """Tell whether any token is tagged as a verb with ``'z'`` at position 4.

    A token matches when its ``'xpos'`` tag starts with ``'V'`` and its fifth
    character is ``'z'``. Slices are used instead of direct indexing so that
    empty tags or verb tags shorter than five characters simply do not match
    instead of raising ``IndexError``.

    Args:
        sentence: List of token dictionaries, each with an ``'xpos'`` key.

    Returns:
        ``True`` if at least one token matches, else ``False``.
    """
    return any(
        token['xpos'][:1] == 'V' and token['xpos'][4:5] == 'z'
        for token in sentence
    )

In [12]:
# input - sentence (list of dictionaries) and index of a word (dictionary)
# output - True/False depending on whether the sentence contains relative pronoun before and the tag
#          of the pronoun in the sentence (returns '' if False)
def contains_relative_pronoun_before(sentence, i):
    """Look backwards from index ``i`` for a token whose tag starts with ``'Pr'``.

    Args:
        sentence: List of token dictionaries, each with an ``'xpos'`` key.
        i: Index of the current token; only tokens before it are inspected.

    Returns:
        ``(True, tag)`` for the closest preceding token whose ``'xpos'``
        starts with ``'Pr'``, or ``(False, '')`` when there is none.
    """
    for token in reversed(sentence[:i]):
        tag = token['xpos']
        if tag.startswith('Pr'):
            return True, tag

    return False, ''

In [13]:
# input - sentence (list of dictionaries) and index of a word (dictionary)
# output - True/False depending on whether the sentence contains conjunction before and the tag
#          of the conjunction in the sentence (returns '' if False)
def contains_conj_before(sentence, i):
    """Look backwards from index ``i`` for a token whose tag starts with ``'C'``.

    Args:
        sentence: List of token dictionaries, each with an ``'xpos'`` key.
        i: Index of the current token; only tokens before it are inspected.

    Returns:
        ``(True, tag)`` for the closest preceding token whose ``'xpos'``
        starts with ``'C'``, or ``(False, '')`` when there is none.
    """
    for token in reversed(sentence[:i]):
        tag = token['xpos']
        if tag.startswith('C'):
            return True, tag

    return False, ''

In [14]:
# input - sentence (list of dictionaries) and index of a word (dictionary)
# output - True/False depending on whether the sentence contains repetitive word before and the tag
#          of the word in the sentence (returns '' if False)
def contains_repetitive_word_before(sentence, i):
    """Check whether the word that follows index ``i`` already occurred before ``i``.

    The text of token ``i + 1`` is compared case-insensitively with the texts
    of the tokens preceding index ``i``, closest first.

    Args:
        sentence: List of token dictionaries with ``'text'`` and ``'xpos'`` keys.
        i: Index of the current token.

    Returns:
        ``(True, tag)`` with the ``'xpos'`` of the closest matching preceding
        token, or ``(False, '')`` when token ``i`` has no successor or no
        preceding token matches.
    """
    if i >= len(sentence) - 1:
        # No following word to compare against.
        return False, ''

    preceding = sentence[:i]
    if not preceding:
        return False, ''

    following_text = sentence[i + 1]['text'].lower()

    for token in reversed(preceding):
        if token['text'].lower() == following_text:
            return True, token['xpos']

    return False, ''

In [15]:
# input - sentence (list of dictionaries) and index of a word (dictionary)
# output - the count of the verbs before the word
def count_of_verbs_before(sentence, i):
    """Count the tokens before index ``i`` whose ``'upos'`` is ``'VERB'``.

    Args:
        sentence: List of token dictionaries, each with a ``'upos'`` key.
        i: Index of the current token; only tokens before it are counted.

    Returns:
        The number of preceding tokens tagged ``'VERB'``.
    """
    return sum(1 for token in sentence[:i] if token['upos'] == 'VERB')

In [16]:
def split_xpos(xpos, word='word'):
    """Split a positional ``xpos`` tag into coarse-grained feature values.

    The first character of the tag selects which slices are extracted:

    * ``N`` / ``M``: type = first two characters, gender/number/article =
      characters 2-4.
    * ``A`` / ``H``: type = first character, gender/number/article =
      characters 1-3.
    * ``V``: type = first two characters; gender/number/article assembled
      from characters 9, 8 and 10 (in that output order).
    * ``P``: type = first two characters; gender/number/article assembled
      from characters 7, 5 and 8 (in that output order).
    * anything else (including an empty tag): type = the whole tag.

    In addition, verbs get ``xpos_mood`` when character 4 is ``z``, ``c`` or
    ``g``; pronouns get ``xpos_ref_type`` when character 2 is one of
    ``e a l m q t``; adjectives get ``xpos_extended`` when character 4 is
    ``e``. Trailing ``-`` characters are stripped from every value.

    NOTE(review): the positions presumably follow the Bulgarian tagset
    emitted by the POS pipeline -- confirm against the tagset documentation.

    Args:
        xpos: The tag string. An empty string is accepted and yields only an
            empty ``xpos_type`` (indexing ``xpos[0]`` would raise IndexError).
        word: Prefix for the produced feature names.

    Returns:
        A dict mapping ``'<word>_<feature>'`` to the extracted value.
    """
    def gender_number_article(number, gender, article):
        # Missing positions (empty slices) are padded with '-'; the output
        # order is gender, number, article.
        return ''.join(feature or '-' for feature in (gender, number, article))

    # Slice instead of index so an empty tag falls through to the default.
    pos = xpos[:1]

    if pos in ('N', 'M'):
        features = {'xpos_type': xpos[:2], 'xpos_gender_number_article': xpos[2:5]}
    elif pos in ('A', 'H'):
        features = {'xpos_type': xpos[:1], 'xpos_gender_number_article': xpos[1:4]}
    elif pos == 'V':
        features = {'xpos_type': xpos[:2],
                    'xpos_gender_number_article': gender_number_article(xpos[8:9], xpos[9:10], xpos[10:11])}
    elif pos == 'P':
        features = {'xpos_type': xpos[:2],
                    'xpos_gender_number_article': gender_number_article(xpos[5:6], xpos[7:8], xpos[8:9])}
    else:
        features = {'xpos_type': xpos}

    if pos == 'V' and xpos[4:5] in ('z', 'c', 'g'):
        features['xpos_mood'] = xpos[4:5]
    elif pos == 'P' and xpos[2:3] in ('e', 'a', 'l', 'm', 'q', 't'):
        features['xpos_ref_type'] = xpos[2:3]
    elif pos == 'A' and xpos[4:5] == 'e':
        features['xpos_extended'] = xpos[4:5]

    return {(word + '_' + key): value.rstrip('-') for key, value in features.items()}

In [17]:
# input - sentence (list of dictionaries) and an index of a word (dictionary)
# output - dictionary of features for the word at index i
def word2features_old(sentence, i):
    """Build the (first-generation) feature dictionary for the token at index ``i``.

    Features cover the token itself, sentence-level flags, and up to two
    neighbours on each side (text, ``upos``, ``xpos`` and whether the text
    starts with an upper-case letter). ``'BOS'`` / ``'EOS'`` mark the first
    and last token. Capitalisation is read from the first character of each
    text, so an empty text raises ``IndexError``.

    Args:
        sentence: List of token dictionaries with ``'text'``, ``'upos'`` and
            ``'xpos'`` keys.
        i: Index of the token to describe.

    Returns:
        A single dictionary of features for that token.
    """
    current = sentence[i]
    last_index = len(sentence) - 1
    has_repetition, repetition_tag = contains_repetitive_word_before(sentence, i)

    features = {
        'word': current['text'],
        'sent_len': len(sentence),
        'upos': current['upos'],
        'xpos': current['xpos'],
        'first_word_in_sent': sentence[0]['text'],
        'contains_interrogative_word': contains_interrogative_word(sentence),
        'contains_interrogative_particle': contains_interrogative_particle(sentence),
        'contains_imperative_verb': contains_imperative_verb(sentence),
        'starts_with_capital_letter': current['text'][0].isupper(),
        'contains_repetitive_word_before': has_repetition,
        'repetitive_word_tag': repetition_tag
    }

    # Previous token, or beginning-of-sentence marker.
    if i > 0:
        previous = sentence[i - 1]
        features['prev_word'] = previous['text']
        features['prev_word_upos'] = previous['upos']
        features['prev_word_xpos'] = previous['xpos']
        features['prev_word_starts_with_capital_letter'] = previous['text'][0].isupper()
    else:
        features['BOS'] = True

    # Token two positions back.
    if i > 1:
        before_previous = sentence[i - 2]
        features['word_before_prev_word'] = before_previous['text']
        features['word_before_prev_word_upos'] = before_previous['upos']
        features['word_before_prev_word_xpos'] = before_previous['xpos']
        features['word_before_prev_word_starts_with_capital_letter'] = before_previous['text'][0].isupper()

    # Next token, or end-of-sentence marker.
    if i < last_index:
        following = sentence[i + 1]
        features['next_word'] = following['text']
        features['next_word_upos'] = following['upos']
        features['next_word_xpos'] = following['xpos']
        features['next_word_starts_with_capital_letter'] = following['text'][0].isupper()
    else:
        features['EOS'] = True

    # Token two positions ahead.
    if i < last_index - 1:
        after_following = sentence[i + 2]
        features['word_after_next_word'] = after_following['text']
        features['word_after_next_word_upos'] = after_following['upos']
        features['word_after_next_word_xpos'] = after_following['xpos']
        features['word_after_next_word_starts_with_capital_letter'] = after_following['text'][0].isupper()

    return features

In [18]:
# input - sentence (list of dictionaries) and an index of a word (dictionary)
# output - dictionary of features for the word at index i
def word2features_old2(sentence, i):
    """Build the (second-generation) feature dictionary for the token at index ``i``.

    Texts are lower-cased. Besides the token's own text and tags, the result
    holds sentence-level flags, backward-looking context (repeated word,
    relative pronoun, conjunction, verb count) and the text/``upos``/``xpos``
    of up to two neighbours on each side. ``'BOS'`` / ``'EOS'`` mark the
    first and last token.

    Args:
        sentence: List of token dictionaries with ``'text'``, ``'upos'`` and
            ``'xpos'`` keys.
        i: Index of the token to describe.

    Returns:
        A single dictionary of features for that token.
    """
    current = sentence[i]
    last_index = len(sentence) - 1
    has_repetition, repetition_tag = contains_repetitive_word_before(sentence, i)
    has_relative_pronoun, relative_pronoun_tag = contains_relative_pronoun_before(sentence, i)
    has_conjunction, found_conjunction_tag = contains_conj_before(sentence, i)

    features = {
        'word': current['text'].lower(),
        'sent_len': len(sentence),
        'upos': current['upos'],
        'xpos': current['xpos'],
        'first_word_in_sent': sentence[0]['text'].lower(),
        'contains_interrogative_word': contains_interrogative_word(sentence),
        'contains_interrogative_particle': contains_interrogative_particle(sentence),
        'contains_imperative_verb': contains_imperative_verb(sentence),
        'contains_repetitive_word_before': has_repetition,
        'repetitive_word_tag': repetition_tag,
        'contains_relative_pronoun_before': has_relative_pronoun,
        'pronoun_tag': relative_pronoun_tag,
        'contains_conjunction_before': has_conjunction,
        'conjunction_tag': found_conjunction_tag,
        'count_of_verbs_before': count_of_verbs_before(sentence, i)
    }

    # Previous token, or beginning-of-sentence marker.
    if i > 0:
        previous = sentence[i - 1]
        features['prev_word'] = previous['text'].lower()
        features['prev_word_upos'] = previous['upos']
        features['prev_word_xpos'] = previous['xpos']
    else:
        features['BOS'] = True

    # Token two positions back.
    if i > 1:
        before_previous = sentence[i - 2]
        features['word_before_prev_word'] = before_previous['text'].lower()
        features['word_before_prev_word_upos'] = before_previous['upos']
        features['word_before_prev_word_xpos'] = before_previous['xpos']

    # Next token, or end-of-sentence marker.
    if i < last_index:
        following = sentence[i + 1]
        features['next_word'] = following['text'].lower()
        features['next_word_upos'] = following['upos']
        features['next_word_xpos'] = following['xpos']
    else:
        features['EOS'] = True

    # Token two positions ahead.
    if i < last_index - 1:
        after_following = sentence[i + 2]
        features['word_after_next_word'] = after_following['text'].lower()
        features['word_after_next_word_upos'] = after_following['upos']
        features['word_after_next_word_xpos'] = after_following['xpos']

    return features

In [19]:
# input - sentence (list of dictionaries) and an index of a word (dictionary)
# output - dictionary of features for the word at index i
def word2features(sentence, i):
    """Build the feature dictionary for the token at index ``i``.

    Texts are lower-cased and each ``xpos`` tag is broken down with
    ``split_xpos`` instead of being used whole. The result holds the token's
    own features, sentence-level flags, the repeated-word context, and the
    text/``upos``/split-``xpos`` of up to two neighbours on each side.
    ``'BOS'`` / ``'EOS'`` mark the first and last token.

    Args:
        sentence: List of token dictionaries with ``'text'``, ``'upos'`` and
            ``'xpos'`` keys.
        i: Index of the token to describe.

    Returns:
        A single dictionary of features for that token.
    """
    current = sentence[i]
    last_index = len(sentence) - 1
    has_repetition, repetition_tag = contains_repetitive_word_before(sentence, i)

    features = {
        'word': current['text'].lower(),
        'sent_len': len(sentence),
        'upos': current['upos'],
        'first_word_in_sent': sentence[0]['text'].lower(),
        'contains_interrogative_word': contains_interrogative_word(sentence),
        'contains_interrogative_particle': contains_interrogative_particle(sentence),
        'contains_imperative_verb': contains_imperative_verb(sentence),
        'contains_repetitive_word_before': has_repetition,
        'repetitive_word_tag': repetition_tag
    }
    features.update(split_xpos(current['xpos']))

    # Previous token, or beginning-of-sentence marker.
    if i > 0:
        previous = sentence[i - 1]
        features['prev_word'] = previous['text'].lower()
        features['prev_word_upos'] = previous['upos']
        features.update(split_xpos(previous['xpos'], 'prev_word'))
    else:
        features['BOS'] = True

    # Token two positions back.
    if i > 1:
        before_previous = sentence[i - 2]
        features['word_before_prev_word'] = before_previous['text'].lower()
        features['word_before_prev_word_upos'] = before_previous['upos']
        features.update(split_xpos(before_previous['xpos'], 'word_before_prev_word'))

    # Next token, or end-of-sentence marker.
    if i < last_index:
        following = sentence[i + 1]
        features['next_word'] = following['text'].lower()
        features['next_word_upos'] = following['upos']
        features.update(split_xpos(following['xpos'], 'next_word'))
    else:
        features['EOS'] = True

    # Token two positions ahead.
    if i < last_index - 1:
        after_following = sentence[i + 2]
        features['word_after_next_word'] = after_following['text'].lower()
        features['word_after_next_word_upos'] = after_following['upos']
        features.update(split_xpos(after_following['xpos'], 'word_after_next_word'))

    return features

In [20]:
# input - sentence (list of dictionaries)
# output - list of features (dictionaries, for each word)
def sent2features(sentence):
    """Compute the feature dictionary of every token in a sentence.

    Args:
        sentence: List of token dictionaries.

    Returns:
        A list with the ``word2features`` result for each token, in order.
    """
    features_per_word = []

    for position in range(len(sentence)):
        features_per_word.append(word2features(sentence, position))

    return features_per_word

In [21]:
# input - JSON-serializable data and name of the output JSON file
# output - None, saves the sentences to a JSON file
def save_as_json(data, output_file_name):
    """Serialize ``data`` as JSON into the given file.

    Args:
        data: JSON-serializable object.
        output_file_name: Path of the file to write; it is opened in ``"w"``
            mode, so existing content is replaced.

    Returns:
        None.
    """
    with open(output_file_name, "w") as json_target:
        json.dump(data, json_target)

In [22]:
# input - JSON file
# output - the contents of the JSON file as an object
def load_json(json_file_name):
    """Read a JSON file and return its decoded content.

    Args:
        json_file_name: Path of the JSON file to read.

    Returns:
        The Python object decoded from the file's JSON text.
    """
    with open(json_file_name, "r") as json_source:
        raw_text = json_source.read()

    return json.loads(raw_text)

In [23]:
# input - name of a text file with one sentence per line and a variable indicating whether or not to save X and y to JSON
# output - X and y - features and labels
def data_prep(input_file_name, json_serialize=False):
    """Turn a file of punctuated sentences into CRF features and labels.

    Steps:
      1. read the sentences and tokenize them with a classla ``'bg'``
         tokenize pipeline;
      2. squash adjacent punctuation tokens and derive the labels ``y``;
      3. rebuild the sentences without punctuation and run them through a
         classla ``'bg'`` tokenize+POS pipeline;
      4. compute the features ``X`` from the POS-tagged tokens.

    The count checks compare the number of sentences between stages. Each
    stage maps sentences one-to-one, so they are only a sanity net; they do
    not detect a per-sentence mismatch between the number of feature dicts
    and the number of labels (see ``verify_prepped_data`` for that).

    Args:
        input_file_name: Path of a text file with one sentence per line.
        json_serialize: When true, also save ``X`` and ``y`` next to the
            input as ``<name>_X.json`` and ``<name>_y.json``, where
            ``<name>`` is the input name with a trailing ``.txt`` removed.
            Only a trailing ``.txt`` is stripped, so a name without that
            suffix can no longer make the dumps overwrite the input file.

    Returns:
        A tuple ``(X, y)`` of per-sentence feature lists and label lists.
    """
    data = read_file_as_list_of_sentences(input_file_name)
    nlp_tokenize = classla.Pipeline('bg', processors='tokenize')
    tokenized_data = run_through_classla_pipeline(data, nlp_tokenize)

    if len(data) != len(tokenized_data):
        print("Warning: Mismatch in the count of the data and tokenized data")

    squashed_tokenized_data = [squash_punctuation(sentence) for sentence in tokenized_data]

    if len(tokenized_data) != len(squashed_tokenized_data):
        print("Warning: Mismatch in the count of the tokenized and squashed tokenized data")

    y = [sent2labels(sentence) for sentence in squashed_tokenized_data]

    if len(squashed_tokenized_data) != len(y):
        print("Warning: Mismatch in the count of the squashed tokenized data and labeled data")

    data_without_punctuation = [remove_punctuation(sentence) for sentence in squashed_tokenized_data]

    if len(data_without_punctuation) != len(y):
        print("Warning: Mismatch in the count of the data without punctuation and labeled data")

    nlp_pos_tokenize = classla.Pipeline('bg', processors='tokenize,pos')
    pos_tokenized_data = run_through_classla_pipeline(data_without_punctuation, nlp_pos_tokenize)

    if len(data_without_punctuation) != len(pos_tokenized_data):
        print("Warning: Mismatch in the count of the data without punctuation and POS tokenized data")

    X = [sent2features(sentence) for sentence in pos_tokenized_data]

    if len(X) != len(pos_tokenized_data):
        print("Warning: Mismatch in the count of the prepped data and POS tokenized data")

    if json_serialize:
        # Strip only a trailing '.txt'; any other name is used whole, so the
        # '_X.json' / '_y.json' outputs never collide with the input file.
        suffix = '.txt'
        if input_file_name.endswith(suffix):
            base_name = input_file_name[:-len(suffix)]
        else:
            base_name = input_file_name

        save_as_json(X, base_name + '_X.json')
        save_as_json(y, base_name + '_y.json')

    return X, y

In [24]:
# input - X and y - features and labels
# output - None, prints on the screen the features-labels pairs that have length mismatch
def verify_prepped_data(X, y):
    """Print every features/labels pair whose lengths differ.

    Pairs are formed with ``zip``, so when ``X`` and ``y`` differ in length
    the surplus entries of the longer one are not inspected.

    Args:
        X: List of per-sentence feature lists.
        y: List of per-sentence label lists.

    Returns:
        None; mismatching pairs are printed.
    """
    mismatched_pairs = (
        (sentence_features, sentence_labels)
        for sentence_features, sentence_labels in zip(X, y)
        if len(sentence_features) != len(sentence_labels)
    )

    for sentence_features, sentence_labels in mismatched_pairs:
        print(sentence_features, sentence_labels)

In [25]:
# input - X and y - features and labels
# output - list of punctuated sentences (strings; y labels applied to X)
def punctuate(X, y):
    """Apply predicted labels to the words of each sentence.

    For every sentence, each word (the ``'word'`` feature) is immediately
    followed by its label, words are separated by single spaces, and
    trailing whitespace is stripped.

    Args:
        X: List of per-sentence feature lists; each feature dict needs a
            ``'word'`` key.
        y: List of per-sentence label lists, indexed in parallel with the
            feature lists.

    Returns:
        A list of punctuated sentence strings.
    """
    punctuated_sentences = []

    for sentence_features, sentence_labels in zip(X, y):
        pieces = [
            token_features['word'] + sentence_labels[position]
            for position, token_features in enumerate(sentence_features)
        ]
        punctuated_sentences.append(' '.join(pieces).rstrip())

    return punctuated_sentences