In [2]:
# !pip install ufal.udpipe
# !pip install spacy-stanza
!pip install --upgrade spacy-stanza
!pip install stanza

Collecting spacy-stanza
  Using cached spacy_stanza-1.0.4-py3-none-any.whl.metadata (8.6 kB)
Collecting stanza<1.7.0,>=1.2.0 (from spacy-stanza)
  Using cached stanza-1.6.1-py3-none-any.whl.metadata (13 kB)
Collecting emoji (from stanza<1.7.0,>=1.2.0->spacy-stanza)
  Downloading emoji-2.14.1-py3-none-any.whl.metadata (5.7 kB)
Using cached spacy_stanza-1.0.4-py3-none-any.whl (9.7 kB)
Using cached stanza-1.6.1-py3-none-any.whl (881 kB)
Downloading emoji-2.14.1-py3-none-any.whl (590 kB)
   ---------------------------------------- 0.0/590.6 kB ? eta -:--:--
   --------------------------------------- 590.6/590.6 kB 10.6 MB/s eta 0:00:00
Installing collected packages: emoji, stanza, spacy-stanza
Successfully installed emoji-2.14.1 spacy-stanza-1.0.4 stanza-1.6.1


In [3]:
from spacy.tokens import Doc
# from ufal.udpipe import Model, Pipeline

import os
import pandas as pd
import re
import spacy
import spacy_stanza
import stanza
import time
import urllib.request
import warnings

In [4]:
# Suppress specific warnings
warnings.filterwarnings("ignore", category=FutureWarning, message=".*torch.load.*")

## 3 Experiments with Stanza and UDPipe

### 3.1 Download and initialize Bulgarian models

In [5]:
# Download Stanza
if not os.path.isdir("./stanza_resources/bg"):
    stanza.download("bg", model_dir="./stanza_resources")

Downloading https://raw.githubusercontent.com/stanfordnlp/stanza-resources/main/resources_1.6.0.json:   0%|   …

2025-01-22 15:35:57 INFO: Downloading default packages for language: bg (Bulgarian) ...


Downloading https://huggingface.co/stanfordnlp/stanza-bg/resolve/v1.6.0/models/default.zip:   0%|          | 0…

2025-01-22 15:36:40 INFO: Finished downloading models and saved to ./stanza_resources.


In [6]:
processors = "tokenize,pos,lemma"  # A string with comma-separated processor names

# Initialize the pipeline
nlp_spacy_stanza = spacy_stanza.load_pipeline(
    "bg",
    dir="./stanza_resources",
    processors=processors
)

# nlp_spacy_stanza = spacy_stanza.load_pipeline("bg", dir="./stanza_resources")

2025-01-22 15:36:44 INFO: Checking for updates to resources.json in case models have been updated.  Note: this behavior can be turned off with download_method=None or download_method=DownloadMethod.REUSE_RESOURCES


Downloading https://raw.githubusercontent.com/stanfordnlp/stanza-resources/main/resources_1.6.0.json:   0%|   …

2025-01-22 15:36:45 INFO: Loading these models for language: bg (Bulgarian):
| Processor | Package      |
----------------------------
| tokenize  | btb          |
| pos       | btb_charlm   |
| lemma     | btb_nocharlm |

2025-01-22 15:36:45 INFO: Using device: cpu
2025-01-22 15:36:45 INFO: Loading: tokenize
2025-01-22 15:36:47 INFO: Loading: pos
2025-01-22 15:36:48 INFO: Loading: lemma
2025-01-22 15:36:48 INFO: Done loading processors!


### 3.2 Define initial functions

In [7]:
def nlp_st(txt):
    return nlp_spacy_stanza(txt)

### 3.3 Compare NLP features

In [8]:
# A helper function for displaying useful NLP features in easy to read format
def inspect_spacy_doc(doc):
    for token in doc:
        print(f"Token: {token.text:<15} Tag: {token.tag_:<15} POS: {token.pos_:<10} Lemma: {token.lemma_:<15} Dep: {token.dep_:<10}")
    print("\n")

In [9]:
sentence = """Сред гостите на официалната церемония по встъпване в длъжност на новоизбрания президент ще бъдат българският \
държавен глава Румен Радев, президентът на Албания Илир Мета и на Косово Хашим Тачи."""

### 3.5 Compare performance with large amount of data

In [12]:
# Create a DataFrame with 10 rows, use same sentence for simplicity
test_df = pd.DataFrame({"text": [sentence] * 10})

In [13]:
def sent_features_to_string(sent):
    """Converts sentence features to strings"""
    words = [tk.pos_ for tk in sent]
    lemmas = [(tk.lemma_) for tk in sent]
    pos = [tk.pos_ for tk in sent]
    tag = [tk.tag_ for tk in sent]    # morph?
    morph = [str(tk.morph) for tk in sent] # features?
    dep = [tk.dep_ for tk in sent]
    n_words = len(sent)

    return words, lemmas, pos, tag, morph, dep, n_words



def extract_features(nlp, row, col):
    """Extracts linguistic features from a text column using a given NLP model"""
    value = row[col]
    doc = nlp(value)
    return sent_features_to_string(doc)

#### 3.5.1 Extracting features using Stanza

In [14]:
start_time = time.time()

test_df[["words_s", "lemmas_s", "pos_s", "morph_s", "features_s", "dep_s", "n_words_s"]] = test_df.apply(
    lambda r: extract_features(nlp_st, r, "text"), axis=1, result_type="expand"
)

end_time = time.time()
execution_time_st = end_time - start_time

In [15]:
print(f"Execution time (stanza): {execution_time_st} seconds")

Execution time (stanza): 6.670640468597412 seconds


In [42]:
test_df

Unnamed: 0,text,pos,tag,dep,morph,lemmas,left_edge,right_edge,num_words,words_s,lemmas_s,pos_s,morph_s,features_s,dep_s,n_words_s
0,Сред гостите на официалната церемония по встъп...,1,2,3,4,5,6,7,8,"[ADP, NOUN, ADP, ADJ, NOUN, ADP, NOUN, ADP, NO...","[сред, гост, на, официален, церемония, по, вст...","[ADP, NOUN, ADP, ADJ, NOUN, ADP, NOUN, ADP, NO...","[R, Ncmpd, R, Afsd, Ncfsi, R, Ncnsi, R, Ncfsi,...","[, Definite=Def|Gender=Masc|Number=Plur, , Def...","[case, root, case, amod, nmod, case, nmod, cas...",31
1,Сред гостите на официалната церемония по встъп...,1,2,3,4,5,6,7,8,"[ADP, NOUN, ADP, ADJ, NOUN, ADP, NOUN, ADP, NO...","[сред, гост, на, официален, церемония, по, вст...","[ADP, NOUN, ADP, ADJ, NOUN, ADP, NOUN, ADP, NO...","[R, Ncmpd, R, Afsd, Ncfsi, R, Ncnsi, R, Ncfsi,...","[, Definite=Def|Gender=Masc|Number=Plur, , Def...","[case, root, case, amod, nmod, case, nmod, cas...",31
2,Сред гостите на официалната церемония по встъп...,1,2,3,4,5,6,7,8,"[ADP, NOUN, ADP, ADJ, NOUN, ADP, NOUN, ADP, NO...","[сред, гост, на, официален, церемония, по, вст...","[ADP, NOUN, ADP, ADJ, NOUN, ADP, NOUN, ADP, NO...","[R, Ncmpd, R, Afsd, Ncfsi, R, Ncnsi, R, Ncfsi,...","[, Definite=Def|Gender=Masc|Number=Plur, , Def...","[case, root, case, amod, nmod, case, nmod, cas...",31
3,Сред гостите на официалната церемония по встъп...,1,2,3,4,5,6,7,8,"[ADP, NOUN, ADP, ADJ, NOUN, ADP, NOUN, ADP, NO...","[сред, гост, на, официален, церемония, по, вст...","[ADP, NOUN, ADP, ADJ, NOUN, ADP, NOUN, ADP, NO...","[R, Ncmpd, R, Afsd, Ncfsi, R, Ncnsi, R, Ncfsi,...","[, Definite=Def|Gender=Masc|Number=Plur, , Def...","[case, root, case, amod, nmod, case, nmod, cas...",31
4,Сред гостите на официалната церемония по встъп...,1,2,3,4,5,6,7,8,"[ADP, NOUN, ADP, ADJ, NOUN, ADP, NOUN, ADP, NO...","[сред, гост, на, официален, церемония, по, вст...","[ADP, NOUN, ADP, ADJ, NOUN, ADP, NOUN, ADP, NO...","[R, Ncmpd, R, Afsd, Ncfsi, R, Ncnsi, R, Ncfsi,...","[, Definite=Def|Gender=Masc|Number=Plur, , Def...","[case, root, case, amod, nmod, case, nmod, cas...",31
5,Сред гостите на официалната церемония по встъп...,1,2,3,4,5,6,7,8,"[ADP, NOUN, ADP, ADJ, NOUN, ADP, NOUN, ADP, NO...","[сред, гост, на, официален, церемония, по, вст...","[ADP, NOUN, ADP, ADJ, NOUN, ADP, NOUN, ADP, NO...","[R, Ncmpd, R, Afsd, Ncfsi, R, Ncnsi, R, Ncfsi,...","[, Definite=Def|Gender=Masc|Number=Plur, , Def...","[case, root, case, amod, nmod, case, nmod, cas...",31
6,Сред гостите на официалната церемония по встъп...,1,2,3,4,5,6,7,8,"[ADP, NOUN, ADP, ADJ, NOUN, ADP, NOUN, ADP, NO...","[сред, гост, на, официален, церемония, по, вст...","[ADP, NOUN, ADP, ADJ, NOUN, ADP, NOUN, ADP, NO...","[R, Ncmpd, R, Afsd, Ncfsi, R, Ncnsi, R, Ncfsi,...","[, Definite=Def|Gender=Masc|Number=Plur, , Def...","[case, root, case, amod, nmod, case, nmod, cas...",31
7,Сред гостите на официалната церемония по встъп...,1,2,3,4,5,6,7,8,"[ADP, NOUN, ADP, ADJ, NOUN, ADP, NOUN, ADP, NO...","[сред, гост, на, официален, церемония, по, вст...","[ADP, NOUN, ADP, ADJ, NOUN, ADP, NOUN, ADP, NO...","[R, Ncmpd, R, Afsd, Ncfsi, R, Ncnsi, R, Ncfsi,...","[, Definite=Def|Gender=Masc|Number=Plur, , Def...","[case, root, case, amod, nmod, case, nmod, cas...",31
8,Сред гостите на официалната церемония по встъп...,1,2,3,4,5,6,7,8,"[ADP, NOUN, ADP, ADJ, NOUN, ADP, NOUN, ADP, NO...","[сред, гост, на, официален, церемония, по, вст...","[ADP, NOUN, ADP, ADJ, NOUN, ADP, NOUN, ADP, NO...","[R, Ncmpd, R, Afsd, Ncfsi, R, Ncnsi, R, Ncfsi,...","[, Definite=Def|Gender=Masc|Number=Plur, , Def...","[case, root, case, amod, nmod, case, nmod, cas...",31
9,Сред гостите на официалната церемония по встъп...,1,2,3,4,5,6,7,8,"[ADP, NOUN, ADP, ADJ, NOUN, ADP, NOUN, ADP, NO...","[сред, гост, на, официален, церемония, по, вст...","[ADP, NOUN, ADP, ADJ, NOUN, ADP, NOUN, ADP, NO...","[R, Ncmpd, R, Afsd, Ncfsi, R, Ncnsi, R, Ncfsi,...","[, Definite=Def|Gender=Masc|Number=Plur, , Def...","[case, root, case, amod, nmod, case, nmod, cas...",31


## 4 Proof of concept

### 4.1 Define one rule

Define one simple rule and put it in a testing function. The rule is: if the word is a noun, is preceded by a preposition, and ends in _ът_ or _ят_, then the long form of the definite article is incorrect. Note that in reality this rule is not enough to determine the correct usage of the definite article but for the purposes of the POC it will suffice.

In [None]:
def verify_definite_article(doc):
    """Detects incorrect usage of the definite article based on POS tags and word endings"""
    errors = []

    for i, token in enumerate(doc):
        # Check if the token is a noun with a definite article (full form)
        if token.pos_ == "NOUN" and token.text.endswith(("ът", "ят")):
            # Check if the preceding token is a preposition (ADP), like "в"
            if i > 0 and doc[i - 1].pos_ == "ADP":
                errors.append(f"Incorrect usage of full definite article: '{token.text}' in sentence: '{doc.text}'")
    return errors

### 4.2 Test and validate

In [None]:
valid_sentences = ["Аз отивам в офиса.", "Иван отива в офиса.", "Офисът е в София."]
invalid_sentences = ["Аз отивам в офисът.", "Иван отива в офисът.", "Офиса е в София."]

In [None]:
# Process each sentence with both libraries
results_st = []
results_ud = []
for sentence in valid_sentences + invalid_sentences:
    doc_st = nlp_st(sentence)
    doc_ud = nlp_ud(sentence)

    results_st.append(verify_definite_article(doc_st))
    results_ud.append(verify_definite_article(doc_ud))

In [None]:
print("Results from Stanza:")
for i, res in enumerate(results_st):
    print(f"{i+1}:", res)

print("\nResults from UDPipe:")
for i, res in enumerate(results_ud):
    print(f"{i+1}:", res)

Results from Stanza:
1: []
2: []
3: []
4: ["Incorrect usage of full definite article: 'офисът' in sentence: 'Аз отивам в офисът.'"]
5: ["Incorrect usage of full definite article: 'офисът' in sentence: 'Иван отива в офисът.'"]
6: []

Results from UDPipe:
1: []
2: []
3: []
4: ["Incorrect usage of full definite article: 'офисът' in sentence: 'Аз отивам в офисът. '"]
5: ["Incorrect usage of full definite article: 'офисът' in sentence: 'Иван отива в офисът. '"]
6: []


Of course we can't expect much correctness of this simple rule but we can see that it did find some incorrect usages in two out of the six cases. Not bad, actually!

## 5 Load and clean the testing data

This short set of sentences contains some common misuses of the definite article and also the correct usage. We will use this set for initial testing of the rules we develop.

In [None]:
test_set_1 = pd.read_csv("data/test_set_1.csv")

In [None]:
test_set_1.head()

Unnamed: 0,error_type,incorrect,correct
0,article_misuse,Ключа е на масата.,Ключът е на масата.
1,article_misuse,Царя пие вино.,Царят пие вино.
2,article_misuse,Ученика е умен и трудолюбив.,Ученикът е умен и трудолюбив.
3,article_misuse,Приятеля ми е в чужбина.,Приятелят ми е в чужбина.
4,article_misuse,Госта пристигна.,Гостът пристигна.


### 5.1 Trim the sentences of whitespaces

In [None]:
test_set_1[['incorrect', 'correct']] = test_set_1[['incorrect', 'correct']].map(str.strip)

### 5.2 Add column with the differing words

In order to perform automated tests we will need to know were exactly is the error in the sentence, if there is an error. Therefore, we'll add a new column with the expected incorrect words. Before that we need to do some cleaning like removal of punctuation. Note that this function is designed to work with Bulgarian in order to keep words 🟠

In [None]:
def clean_sentence_bg(sentence):
    """Cleans a Bulgarian sentence by removing unwanted punctuation but preserving valid dashes."""
    # Remove dashes not surrounded by exactly two bg letters
    # keep also numbers in order to preserve words like 5-годишен
    sentence = re.sub(r"(?<![0-9а-яА-Я])-|-(?![а-яА-Я])", "", sentence)

    # Remove all other punctuation except for valid dashes
    sentence = re.sub(r"[^\w\s-]", "", sentence)

    return sentence

In [None]:
def get_differing_words(row):
    """Extracts differing words between two sentences and logs word count discrepancies."""
    # Clean and split the sentences into words
    incorrect_words = clean_sentence_bg(row["incorrect"]).split()
    correct_words = clean_sentence_bg(row["correct"]).split()

    # Log if word count differs
    if len(incorrect_words) != len(correct_words):
        print(f"Word count differs: Incorrect - {len(incorrect_words)} words, Correct - {len(correct_words)} words")

    # Get words from the correct sentence that are not in the incorrect sentence
    differing_correct_words = [word for word in correct_words if word not in incorrect_words]
    differing_incorrect_words = [word for word in incorrect_words if word not in correct_words]

    return ",".join(differing_incorrect_words), ",".join(differing_correct_words)

In [None]:
# add a new column to hold the correct and incorrect words, they will be used for automated testing
test_set_1[["correct_words", "incorrect_words"]] = test_set_1.apply(get_differing_words, axis=1, result_type="expand")

In [None]:
test_set_1.head(3)

Unnamed: 0,error_type,incorrect,correct,correct_words,incorrect_words
0,article_misuse,Ключа е на масата.,Ключът е на масата.,Ключа,Ключът
1,article_misuse,Царя пие вино.,Царят пие вино.,Царя,Царят
2,article_misuse,Ученика е умен и трудолюбив.,Ученикът е умен и трудолюбив.,Ученика,Ученикът


### 5.3 Melt and sort

Merge the columns _incorrect_ and _correct_ to a single column _text_. Keep each pair of incorrect and correct sentences together.

In [None]:
def melt_and_sort_sentences(df):
    """Transforms a DataFrame to melt sentence pairs, ensuring incorrect and correct sentences are paired."""

    # Swap 'incorrect_words' and 'correct_words' columns
    df = df.rename(columns={"incorrect_words": "temp_correct_words", "correct_words": "incorrect_words"})
    df = df.rename(columns={"temp_correct_words": "correct_words"})

    # Add an order column to keep track of the original order
    df["order"] = df.index

    # Melt the DataFrame
    df_melted = pd.melt(
        df,
        id_vars=["correct_words", "incorrect_words", "order"],
        value_vars=["incorrect", "correct"],
        var_name="is_correct",
        value_name="text",
    )

    # Set 'is_correct' to True for 'correct' and False for 'incorrect'
    df_melted["is_correct"] = df_melted["is_correct"].apply(lambda x: x == "correct")

    # Assign incorrect_words and correct_words only to incorrect sentences
    df_melted["incorrect_words"] = df_melted.apply(lambda row: row["incorrect_words"] if not row["is_correct"] else "", axis=1)
    df_melted["correct_words"] = df_melted.apply(lambda row: row["correct_words"] if not row["is_correct"] else "", axis=1)

    # Sort by 'order' to ensure the pairs of correct/incorrect sentences are next to each other
    df_melted = df_melted.sort_values(by=["order", "is_correct"], ascending=[True, True])

    # Reset index for clean output
    df_melted.reset_index(drop=True, inplace=True)

    # Drop the 'order' column as it's no longer needed
    df_melted = df_melted.drop(columns=["order"])

    # Reorder columns to match the desired output
    df_melted = df_melted[["text", "is_correct", "incorrect_words", "correct_words"]]

    return df_melted

In [None]:
test_set_1 = melt_and_sort_sentences(test_set_1)

In [None]:
test_set_1.head(6)

Unnamed: 0,text,is_correct,incorrect_words,correct_words
0,Ключа е на масата.,False,Ключа,Ключът
1,Ключът е на масата.,True,,
2,Царя пие вино.,False,Царя,Царят
3,Царят пие вино.,True,,
4,Ученика е умен и трудолюбив.,False,Ученика,Ученикът
5,Ученикът е умен и трудолюбив.,True,,


### 5.4 Add columns with Stanza NLP features

In [None]:
if True:
    # extract stanza features as new columns
    feature_columns_st = ["pos", "tag", "dep", "morph", "lemmas", "left_edge", "right_edge", "num_tokens"]
    test_set_1[feature_columns_st] = test_set_1.apply(lambda r: extract_features(nlp_st, r, "text"), axis=1, result_type="expand")

    # save in case we want to load it faster
    test_set_1.to_csv("data/test_set_1_clean.csv", index=None)
else:
    # load from file instead of the above, it is faster
    test_set_1 = pd.read_csv("data/test_set_1_clean.csv")
    test_set_1["correct_words"] = test_set_1["incorrect_words"].fillna("")
    test_set_1["incorrect_words"] = test_set_1["incorrect_words"].fillna("")

In [None]:
test_set_1.columns

Index(['text', 'is_correct', 'incorrect_words', 'correct_words', 'pos', 'tag',
       'dep', 'morph', 'lemmas', 'left_edge', 'right_edge', 'num_tokens'],
      dtype='object')

In [None]:
test_set_1[["text", "num_tokens", "pos", "tag", "dep", "morph", "lemmas", "left_edge", "right_edge"]].head(1).T

Unnamed: 0,0
text,Ключа е на масата.
num_tokens,5
pos,"NOUN,AUX,ADP,NOUN,PUNCT"
tag,"Ncmsh,Vxitf-r3s,R,Ncfsd,punct"
dep,"nsubj,cop,case,root,punct"
morph,"Definite=Def|Gender=Masc|Number=Sing,Aspect=Im..."
lemmas,"ключ,съм,на,маса,."
left_edge,"Ключа,е,на,Ключа,."
right_edge,"Ключа,е,на,.,."


## 6 Define rule based logic

### 6.1 Implement NLP specific functions

This section contains functions that extract grammatical details like gender, number, and sentence role from tagged words.

In [None]:
def extract_gender_from_tag(tag):
    gender = " "
    # gender is applicable to common noun, proper noun or adjective
    if tag.startswith(("Nc", "Np", "A")):
        if "m" in tag:
            gender = "Masculine"
        elif "f" in tag:
            gender = "Feminine"
        elif "n" in tag:
            gender = "Neutral"
    return gender


# TODO: need to exclude triple character patterns like "p1s": "Past tense, 1st person sng",
def extract_number_from_tag(tag):
    number = None
    if "s" in tag:
        number = "Singular"
    elif "p" in tag:
        number = "Plural"
    return number


def is_masculine(row_dict, i):
    tag = row_dict["tag"].split(",")[i]
    return extract_gender_from_tag(tag) == "Masculine"


def is_singular(row_dict, i):
    tag = row_dict["tag"].split(",")[i]
    return extract_number_from_tag(tag) == "Singular"


def is_masculine_singular(row_dict, i):
    return is_masculine(row_dict, i) and is_singular(row_dict, i)


def is_dep_subject(row_dict, i):
    dep = row_dict["dep"].split(",")[i]
    return dep in {"nsubj", "csubj", "nsubj:pass", "csubj:pass"}


def nlp_get_pos(row_dict, i):
    return row_dict["pos"].split(",")[i]


def nlp_get_dep(row_dict, i):
    return row_dict["dep"].split(",")[i]


def nlp_get_lemma(row_dict, i):
    return row_dict["lemmas"].split(",")[i]


def nlp_get_article(row_dict, i):
    morph = row_dict["morph"].split(",")[i]
    if "Definite=Def" in morph:
        return "definite"
    elif "Definite=Ind" in morph:
        return "indefinite"
    else:
        return ""

In [None]:
# for quick viewing the features
# use updpipe object for the extraction of the word because of performance
# use stanza object for determining features because of accuracy
def inspect_word(row_dict, word_num):
    word = nlp_ud(row_dict["text"])[word_num].text
    pos = row_dict["pos"].split(",")[word_num]
    tag = row_dict["tag"].split(",")[word_num]
    dep = row_dict["dep"].split(",")[word_num]
    gen_s = extract_gender_from_tag(tag)[0:1]  # M - Masculine, F - Feminine, N - Neutral
    num_s = extract_number_from_tag(tag)[0:1]

    return f"{row_dict['index']} {word:<15} POS:{pos:<{10}} Gen:{gen_s}    Num:{num_s}   DEP:{dep:<{10}} Sent: {row_dict["text"]:<30}"

### 6.2 Implement generic testing function

Before we start implementing any rules, we should first create a testing function for them. Since we don’t have the specific rule functions ready, we’ll set up a general testing function - actually, we’ll create three functions - that can take in a list of conditions and a list of rules. It’s important that both lists have the same length. The testing function will process text, apply these conditions and rules to detect errors, and then compare the results with expected outcomes, logging whether each test passes or fails.

In [None]:
def all_rules(row_dict, condition_fns, rule_fns):
    # ⚠️Make sure the sentence always ends with a punctuation. This is a workaround!
    # UDPipe not always separates into tokens correctly, sometimes reports the word+punctuation (or special symbol) as single token.
    # Stanza has bad performance and this function is called iteratively multiple times, not a good idea to use Stanza here.
    sentence = row_dict["text"] + "."
    doc = nlp_ud(sentence)
    errors = []

    # ⚠️This is a very ugly workaround. UDPipe does not correctly split into tokens when there is punctuation
    # in the middle of the sentence. We return "no errors" here which might make some tests fail but
    # the goal is not to raise an exception. The correct solution is not to use UDPipe at all and pre-calculate the tokens
    # in the beginning.
    if (len(doc) != row_dict['num_tokens']):
        return errors

    # Iterate over token pairs with indices
    # for i in range(len(doc) - 1):
    for i in range(row_dict['num_tokens'] - 1):
        token1 = doc[i].text
        token2 = doc[i + 1].text

        conditions = [condition_fn(row_dict, i) for condition_fn in condition_fns]

        for condition, rule_fn in zip(conditions, rule_fns):
            if condition:
                errors += rule_fn(token1, token2, i)

    return errors

In [None]:
def test_definite_article(row_dict, condition_fns, rule_fns):
    actual_wrong_words = []
    expected_wrong_words = [word.strip() for word in row_dict["incorrect_words"].split(",") if word.strip() != ""]
    actual_wrong_words = all_rules(row_dict, condition_fns, rule_fns)

    test_res = "Pass" if set(actual_wrong_words) == set(expected_wrong_words) else "Fail"

    return (test_res, actual_wrong_words)

In [None]:
# tests all sentences from a dataset that already contains NLP tags
def test_definite_article_all(nlp_df, conditions_fns, rules_fns, print_passed=True, print_failed=True, print_total=True):
    """Tests definite articles in sentences and logs results."""
    idx_failed, idx_passed = [], []

    for i, row in nlp_df.iterrows():
        row_dict = row.to_dict()
        res = test_definite_article(row_dict, conditions_fns, rules_fns)
        status, incorrect = res[0], res[1]

        if status == "Fail":
            idx_failed.append(i)
            if print_failed:
                print(i, f'❌ {row_dict["text"]} (Actual: \'{",".join(incorrect)}\', Expected: \'{row_dict["incorrect_words"]}\')')
        else:
            idx_passed.append(i)
            message = f'✅ {row_dict["text"]} (\'{",".join(incorrect)}\' is incorrect)' if incorrect else f'✅ {row_dict["text"]} (The sentence is correct)'
            if print_passed:
                print(i, message)

    if print_total:
        n_failed = len(idx_failed)
        print("✅ All tests passed" if n_failed == 0 else f"❌ {n_failed}/{len(nlp_df)} failed.")

    return idx_passed, idx_failed

### 6.3 Iteration 1

#### 6.3.1 Implement Rule1

According to Rule1, long definite article should be used when the noun takes the role of the subject. Additionally, in order to exclude groups of adjective + noun, which fall under Rule3, we need to exclude noun that follow an adjective.

In [None]:
def cond_rule1(row_dict, i):
    """Searches for a NOUN, masculine, singular, which is the SUBJECT (nominal or clausal) in the sentence,
       and is not preceded by an ADJective"""
    is_token1_adj = nlp_get_pos(row_dict, i) == "ADJ"
    is_token1_noun = nlp_get_pos(row_dict, i) == "NOUN"
    is_token1_masc_sg = is_masculine_singular(row_dict, i)
    is_token1_subj = is_dep_subject(row_dict, i)

    is_token2_noun = nlp_get_pos(row_dict, i + 1) == "NOUN"
    is_token2_masc_sg = is_masculine_singular(row_dict, i + 1)
    is_token2_subj = is_dep_subject(row_dict, i + 1)

    # if token is masculine singular noun and subj, and the previous token is not an adjective
    if i == 0 and is_token1_masc_sg and is_token1_noun and is_token1_subj:
        return True
    if i != 0 and not is_token1_adj and is_token2_masc_sg and is_token2_noun and is_token2_subj:
        return True

    return False

In [None]:
def rule1(token1, token2, token_idx):
    errors = []
    word = token1 if token_idx == 0 else token2
    if not word.endswith(("ът", "ят")):
        errors.append(word)
    return errors

#### 6.3.2 Test Rule1

In [None]:
idx_passed, idx_current = test_definite_article_all(test_set_1, [cond_rule1], [rule1])

0 ✅ Ключа е на масата. ('Ключа' is incorrect)
1 ✅ Ключът е на масата. (The sentence is correct)
2 ✅ Царя пие вино. ('Царя' is incorrect)
3 ✅ Царят пие вино. (The sentence is correct)
4 ✅ Ученика е умен и трудолюбив. ('Ученика' is incorrect)
5 ✅ Ученикът е умен и трудолюбив. (The sentence is correct)
6 ✅ Приятеля ми е в чужбина. ('Приятеля' is incorrect)
7 ✅ Приятелят ми е в чужбина. (The sentence is correct)
8 ✅ Госта пристигна. ('Госта' is incorrect)
9 ✅ Гостът пристигна. (The sentence is correct)
10 ❌ Готвача приготви обяда. (Actual: '', Expected: 'Готвача')
11 ✅ Готвачът приготви обяда. (The sentence is correct)
12 ✅ Влака спря на гара София. ('Влака' is incorrect)
13 ✅ Влакът спря на гара София. (The sentence is correct)
14 ✅ Учителя говореше за Стефан Стамболов. ('Учителя' is incorrect)
15 ✅ Учителят говореше за Стефан Стамболов. (The sentence is correct)
16 ❌ Таня търси лекарят. (Actual: '', Expected: 'лекарят')
17 ✅ Таня търси лекаря. (The sentence is correct)
18 ❌ Говорим за ле

##### Analyze some of the failures

Now let's taкe the first several failed tests and figure out why they failed.

In [None]:
def row_to_dict(df, index):
    row_dict = df.loc[index].to_dict()
    row_dict["index"] = index
    return row_dict

In [None]:
res = []
res.append(inspect_word(row_to_dict(test_set_1, 10), 0))  # 👉Готвача приготви обяда.
res.append(inspect_word(row_to_dict(test_set_1, 16), 2))  # Таня търси 👉лекарят.
res.append(inspect_word(row_to_dict(test_set_1, 18), 2))  # Говорим за 👉лекарят.
res.append(inspect_word(row_to_dict(test_set_1, 20), 5))  # Таня търси Ангел Иванчев, 👉лекарят.
res.append(inspect_word(row_to_dict(test_set_1, 22), 5))  # Говорим за Ангел Иванчев, 👉лекарят.
res.append(inspect_word(row_to_dict(test_set_1, 24), 3))  # Той ми даде 👉ключът от къщата.
print("\n".join(res))

10 Готвача         POS:NOUN       Gen:F    Num:S   DEP:nsubj      Sent: Готвача приготви обяда.       
16 лекарят         POS:NOUN       Gen:M    Num:S   DEP:nsubj      Sent: Таня търси лекарят.           
18 лекарят         POS:NOUN       Gen:M    Num:S   DEP:iobj       Sent: Говорим за лекарят.           
20 лекарят         POS:NOUN       Gen:M    Num:S   DEP:nmod       Sent: Таня търси Ангел Иванчев, лекарят.
22 лекарят         POS:NOUN       Gen:M    Num:S   DEP:conj       Sent: Говорим за Ангел Иванчев, лекарят.
24 ключът          POS:NOUN       Gen:M    Num:S   DEP:obj        Sent: Той ми даде ключът от къщата. 


The problem with _**готвача**_ is that it is considered to be feminine by Stanza. For now we **will postpone fixing** because without context it is impossible to determine the gender, and without gender we can't apply Rule1.

The word _**лекарят**_ in _Таня търси лекарят_ is determined as an nsubj (nominal subject) by Stanza but the correct dependency is object. Well, grammatically it would be correct if we assume that the doctor is the doer in the sentence. However, that is the less popular word order. Let's **postpone fixing** this sentence for later.

The word _**Лекарят**_ in the sentences \
_Говорим за лекарят_, \
_Таня търси Ангел Иванчев, лекарят_ \
and _Говорим за Ангел Иванчев, лекарят._ \
is determined to be iobj (indirect object), nmod (nominal modifier) and conj (conjust) so that falls under Rule2.

_**ключът**_ is marked as object so it also falls under Rule2.

So the last 4 failures are expected to be fixed after we implement Rule2.

Separate the failed and analyzed tests into one list and the passed tests into another list. The one with the passed tests will be used for regression testing.

In [None]:
idx_passed = sorted(list(set(idx_passed) - set([10, 16])))
idx_current = sorted(list(set(idx_current) - set([10, 16])))

### 6.4 Iteration 2

#### 6.4.1 Implement Rule2

Here we will implement the first part of Rule2, which states that when the noun takes the role of the object, then the short form of the definite article should be used.

In [None]:
def cond_rule2(row_dict, i):
    """Searches for masculine singular nouns that are the object in the sentence"""
    is_token1_noun = nlp_get_pos(row_dict, i) == "NOUN"
    is_token1_masc_sg = is_masculine_singular(row_dict, i)
    is_token1_subj = is_dep_subject(row_dict, i)

    if is_token1_noun and is_token1_masc_sg and not is_token1_subj:
        return True
    return False

In [None]:
def rule2(token1, token2, idx_token):
    errors = []
    if token1.endswith(("ът", "ят")):  # objects should not have full article
        errors.append(token1)
    return errors

#### 6.4.2 Test Rule2

In [None]:
p, c = test_definite_article_all(test_set_1.loc[idx_current], [cond_rule1, cond_rule2], [rule1, rule2])

18 ✅ Говорим за лекарят. ('лекарят' is incorrect)
20 ✅ Таня търси Ангел Иванчев, лекарят. ('лекарят' is incorrect)
22 ✅ Говорим за Ангел Иванчев, лекарят. ('лекарят' is incorrect)
24 ✅ Той ми даде ключът от къщата. ('ключът' is incorrect)
26 ❌ Принца говори с царят. (Actual: 'царят', Expected: 'Принца,царят')
28 ✅ Аз помагам на ученикът. ('ученикът' is incorrect)
30 ✅ Много пътници слязоха от влакът. ('влакът' is incorrect)
32 ✅ Слушахме с интерес учителят. ('учителят' is incorrect)
34 ❌ Дългоочаквания гост дойде. (Actual: '', Expected: 'Дългоочаквания')
36 ❌ Иван, госта от Сопот, дойде. (Actual: '', Expected: 'госта')
38 ❌ Иван е съученика ми. (Actual: '', Expected: 'съученика')
40 ❌ Иван е добрия. (Actual: '', Expected: 'добрия')
42 ❌ Иван е успелия. (Actual: '', Expected: 'успелия')
44 ❌ Пешо се оказа най-верния му приятел. (Actual: '', Expected: 'най-верния')
46 ❌ Пешо се оказа дарителя на училището. (Actual: '', Expected: 'дарителя')
48 ❌ Ученикът на първия чин изглежда най-доволн

In the previous run rows 18, 20, 22 and 24 failed, now after implementing Rule2 they pass. Additionally, 4 more tests pass.

##### Analyze some of the failures

In [None]:
res = []
res.append(inspect_word(row_to_dict(test_set_1, 26), 0))  # 👉Принца говори с царят.
res.append(inspect_word(row_to_dict(test_set_1, 34), 0))  # 👉Дългоочаквания гост дойде.
res.append(inspect_word(row_to_dict(test_set_1, 34), 1))  # Дългоочаквания 👉гост дойде.
res.append(inspect_word(row_to_dict(test_set_1, 36), 2))  # Иван, 👉госта от Сопот, дойде.
res.append(inspect_word(row_to_dict(test_set_1, 38), 2))  # Иван е 👉съученика ми.
res.append(inspect_word(row_to_dict(test_set_1, 40), 2))  # Иван е 👉добрия.
res.append(inspect_word(row_to_dict(test_set_1, 42), 2))  # Иван е 👉успелия.
print("\n".join(res))

26 Принца          POS:NOUN       Gen:F    Num:S   DEP:nsubj      Sent: Принца говори с царят.        
34 Дългоочаквания  POS:ADJ        Gen:M    Num:S   DEP:amod       Sent: Дългоочаквания гост дойде.    
34 гост            POS:NOUN       Gen:M    Num:S   DEP:nsubj      Sent: Дългоочаквания гост дойде.    
36 госта           POS:NOUN       Gen:M    Num:S   DEP:nmod       Sent: Иван, госта от Сопот, дойде.  
38 съученика       POS:NOUN       Gen:M    Num:S   DEP:root       Sent: Иван е съученика ми.          
40 добрия          POS:ADJ        Gen:M    Num:S   DEP:root       Sent: Иван е добрия.                
42 успелия         POS:VERB       Gen:     Num:S   DEP:root       Sent: Иван е успелия.               


* _Принца_ - considered feminine, which is wrong, let's postpone fixing for now.
* _Дългоочаквания_  - will be fixed by Rule3
* _гост_            - also will be fixed by Rule3
* _госта_           - add to later (not sure yet which rule to apply)
* _съученика_       - add to later (because root)
* _добрия_          - add to later (because root)
* _успелия_         - Stanza thinks this is a VERB, which is wrong, so let's add to the list of "will not fix"

In [None]:
# keep the lists of passed and current tests up to date
idx_failed = [26, 36, 38, 40, 42]
idx_passed = sorted(set(idx_passed + p))
idx_passed = sorted(list(set(idx_passed) - set(idx_failed)))
idx_current = sorted(list(set(idx_current) - set(p)))
idx_current = sorted(list(set(idx_current) - set(idx_failed)))

#### 6.4.3 Regression Test

Test the previously successful sentences to determine if Rule2 caused any breakages. Perform the regression using all rules defined till now (Rule1, Rule2)

In [None]:
_ = test_definite_article_all(test_set_1.loc[idx_passed], [cond_rule1, cond_rule2], [rule1, rule2], print_passed=False)

37 ❌ Иван, гостът от Сопот, дойде. (Actual: 'гостът', Expected: '')
39 ❌ Иван е съученикът ми. (Actual: 'съученикът', Expected: '')
59 ❌ Приятелят ни е лекарят. (Actual: 'лекарят', Expected: '')
61 ❌ Ангел Иванчев, лекарят, живее тука. (Actual: 'лекарят', Expected: '')
63 ❌ Приятелят ни е Ангел Иванчев, лекарят. (Actual: 'Приятелят,лекарят', Expected: '')
❌ 5/58 failed.


##### Analyze results from the regression

In [None]:
res = []
res.append(inspect_word(row_to_dict(test_set_1, 37), 2))  # Иван, 👉гостът от Сопот, дойде.
res.append(inspect_word(row_to_dict(test_set_1, 39), 2))  # Иван е 👉съученикът ми.
res.append(inspect_word(row_to_dict(test_set_1, 59), 3))  # Приятелят ни е 👉лекарят.
res.append(inspect_word(row_to_dict(test_set_1, 61), 3))  # Ангел Иванчев, 👉лекарят, живее тука.
res.append(inspect_word(row_to_dict(test_set_1, 63), 6))  # Приятелят ни е Ангел Иванчев, 👉лекарят.
print("\n".join(res))

37 гостът          POS:NOUN       Gen:M    Num:S   DEP:nmod       Sent: Иван, гостът от Сопот, дойде. 
39 съученикът      POS:NOUN       Gen:M    Num:S   DEP:root       Sent: Иван е съученикът ми.         
59 лекарят         POS:NOUN       Gen:M    Num:S   DEP:root       Sent: Приятелят ни е лекарят.       
61 лекарят         POS:NOUN       Gen:M    Num:S   DEP:nmod       Sent: Ангел Иванчев, лекарят, живее тука.
63 лекарят         POS:NOUN       Gen:M    Num:S   DEP:conj       Sent: Приятелят ни е Ангел Иванчев, лекарят.


When we implement the next rules these failures should be fixed. For now let's move them to the list with failed tests. We'll get back to them later.

In [None]:
idx_failed = [37, 39, 59, 61, 63]
idx_passed = sorted(list(set(idx_passed) - set(idx_failed)))

### 6.5 Iteration 3

#### 6.5.1 Implement Rule3

Rule3 states that adjectives, numerals, participles and possessive pronouns take the same article as the noun they agree with. Let's first implement the rules for adjectives.

In [None]:
# Пълен член се появява в цялата група на подлога, към която принадлежат неговите определения или приложения. [3]
# In adjective-noun phrases, only the adjective takes a definite article ending.
def cond_rule3(row_dict, i):
    """Searches for ADJ/amod + NOUN/nsubj, both masculine/singular"""
    is_token1_adj = nlp_get_pos(row_dict, i) == "ADJ"
    is_token1_masc_sg = is_masculine_singular(row_dict, i)
    is_token1_amod = nlp_get_dep(row_dict, i) == "amod"

    is_token2_noun = nlp_get_pos(row_dict, i + 1) == "NOUN"
    is_token2_masc_sg = is_masculine_singular(row_dict, i + 1)
    is_token2_nsubj = nlp_get_dep(row_dict, i + 1) == "nsubj"

    if is_token1_adj and is_token2_noun:
        if is_token1_masc_sg and is_token2_masc_sg:
            if is_token1_amod and is_token2_nsubj:
                return True
    return False

In [None]:
def rule3(token1, token2, idx_token):
    errors = []

    # Дългоочаквания гост
    if token1.endswith(("я")):  # прилагателното трябва да е с пълен член
        errors.append(token1)

    # Дългоочакваният гост
    if token2.endswith(("ът", "ят")):  # съществителното трябва да е с непълен член
        errors.append(token2)

    return errors

#### 6.5.2 Test Rule3

In [None]:
conditions = [cond_rule1, cond_rule2, cond_rule3]
rules = [rule1, rule2, rule3]
p, c = test_definite_article_all(test_set_1.loc[idx_current], conditions, rules)

34 ✅ Дългоочаквания гост дойде. ('Дългоочаквания' is incorrect)
44 ❌ Пешо се оказа най-верния му приятел. (Actual: '', Expected: 'най-верния')
46 ❌ Пешо се оказа дарителя на училището. (Actual: '', Expected: 'дарителя')
48 ❌ Ученикът на първия чин изглежда най-доволния от всички. (Actual: '', Expected: 'най-доволния')
50 ❌ Малкия, заповядай едно бонбонче! (Actual: '', Expected: 'Малкия')
58 ❌ Приятеля ни е лекаря. (Actual: 'Приятеля', Expected: 'Приятеля,лекаря')
60 ❌ Ангел Иванчев, лекаря, живее тука. (Actual: '', Expected: 'лекаря')
62 ❌ Приятеля ни е Ангел Иванчев, лекаря. (Actual: '', Expected: 'Приятеля,лекаря')
64 ❌ Той е най-високия и хубав в стаята. (Actual: '', Expected: 'най-високия')
66 ❌ Аз живея в новият бял блок. (Actual: '', Expected: 'новият')
68 ❌ Високия бял блок е нов. (Actual: '', Expected: 'Високия')
72 ❌ Ученикът, а не учителят извика при себе си директорът. (Actual: 'учителят,директорът', Expected: 'Ученикът,учителят')
73 ❌ Ученика, а не учителя извика при себе с

##### Analyze the test

In [None]:
res = []
res.append(inspect_word(row_to_dict(test_set_1, 44), 3))  # Пешо се оказа 👉най-верния му приятел.
res.append(inspect_word(row_to_dict(test_set_1, 46), 3))  # Пешо се оказа 👉дарителя на училището.
res.append(inspect_word(row_to_dict(test_set_1, 48), 5))  # Ученикът на първия чин изглежда 👉най-доволния от всички.
res.append(inspect_word(row_to_dict(test_set_1, 50), 0))  # 👉Малкия, заповядай едно бонбонче!
res.append(inspect_word(row_to_dict(test_set_1, 58), 0))  # 👉Приятеля ни е 👉лекаря.
res.append(inspect_word(row_to_dict(test_set_1, 60), 3))  # Ангел Иванчев, 👉лекаря, живее тука.
res.append(inspect_word(row_to_dict(test_set_1, 62), 0))  # 👉Приятеля ни е Ангел Иванчев, 👉лекаря.
print("\n".join(res))

44 най-верния      POS:ADJ        Gen:M    Num:S   DEP:amod       Sent: Пешо се оказа най-верния му приятел.
46 дарителя        POS:NOUN       Gen:M    Num:S   DEP:obj        Sent: Пешо се оказа дарителя на училището.
48 най-доволния    POS:ADJ        Gen:M    Num:S   DEP:obj        Sent: Ученикът на първия чин изглежда най-доволния от всички.
50 Малкия          POS:ADJ        Gen:M    Num:S   DEP:vocative   Sent: Малкия, заповядай едно бонбонче!
58 Приятеля        POS:NOUN       Gen:M    Num:S   DEP:nsubj      Sent: Приятеля ни е лекаря.         
60 лекаря          POS:NOUN       Gen:M    Num:S   DEP:nmod       Sent: Ангел Иванчев, лекаря, живее тука.
62 Приятеля        POS:NOUN       Gen:M    Num:S   DEP:root       Sent: Приятеля ни е Ангел Иванчев, лекаря.


* 44 _най-верния_ - should be fixed by Rule4
* 46 _дарителя_ - should be fixed by Rule4
* 48 _най_доволния_ - should be fixed by Rule4
* 50 _Малкия_ - should be fixed by Rule_6
* 58 _Приятеля_ on row 58 - should be fixed by Rule_7
* 60 _лекаря_ - should be fixed by Rule_7
* 62 _Приятеля_ on row 62 - should be fixed by Rule_7

In [None]:
# keep the lists with passing and current tests up to date
idx_failed = [50, 58, 60, 62]
idx_passed = sorted(set(idx_passed + p))
idx_passed = sorted(list(set(idx_passed) - set(idx_failed)))
idx_current = sorted(list(set(idx_current) - set(p)))
idx_current = sorted(list(set(idx_current) - set(idx_failed)))

#### 6.5.3 Regression Test

Ensure that we have not violated Rule 1 and Rule 2:

In [None]:
_ = test_definite_article_all(test_set_1.loc[idx_passed], conditions, rules, print_passed=False)

✅ All tests passed


### 6.6 Iteration 4

#### 6.6.1 Implement Rule4

According to Rule4, the full definite article should be used when a noun is after verbs like _съм_, _бъда_, _оказвам се_, _изглеждам_, etc. In order to check the previous verb, we need to check the _lemma_ of the verb.

First let's see what the previously failed sentences look like.

In [None]:
inspect_spacy_doc(nlp_st(test_set_1.loc[39, "text"]))
inspect_spacy_doc(nlp_st(test_set_1.loc[44, "text"]))
inspect_spacy_doc(nlp_st(test_set_1.loc[46, "text"]))
inspect_spacy_doc(nlp_st(test_set_1.loc[48, "text"]))

Token: Иван            Tag: Npmsi           POS: PROPN      Lemma: иван            Dep: nsubj     
Token: е               Tag: Vxitf-r3s       POS: AUX        Lemma: съм             Dep: cop       
Token: съученикът      Tag: Ncmsf           POS: NOUN       Lemma: съученик        Dep: root      
Token: ми              Tag: Psot--1         POS: PRON       Lemma: аз              Dep: det       
Token: .               Tag: punct           POS: PUNCT      Lemma: .               Dep: punct     


Token: Пешо            Tag: Npmsi           POS: PROPN      Lemma: пешо            Dep: nsubj     
Token: се              Tag: Ppxta           POS: PRON       Lemma: се              Dep: expl      
Token: оказа           Tag: Vpptf-o3s       POS: VERB       Lemma: окажа-(се)      Dep: root      
Token: най-верния      Tag: Amsh            POS: ADJ        Lemma: верен           Dep: amod      
Token: му              Tag: Psot--3--m      POS: PRON       Lemma: мой             Dep: det       
Token: п

<img src="images/rule4_pattern_new.png" width="800">

The above pattern shows that the first token is a verb (including auxiliary), while the second token is either a noun or an adjective, both of which must be in masculine singular form.

In [None]:
# Пълен е членът и на името, употребено след глаголи като съм, бъда, оказвам се, изглеждам и др. [3]
def cond_rule4(row_dict, i):
    is_token1_aux = nlp_get_pos(row_dict, i) == "AUX"
    is_token1_verb = nlp_get_pos(row_dict, i) == "VERB"

    is_token2_noun = nlp_get_pos(row_dict, i + 1) == "NOUN"
    is_token2_adj = nlp_get_pos(row_dict, i + 1) == "ADJ"
    is_token2_masc_sg = is_masculine_singular(row_dict, i + 1)
    is_article_indefinite = nlp_get_article(row_dict, i + 1) == "indefinite"

    is_lemma_aux_syn = False

    if is_token2_masc_sg:
        if is_token2_noun or (is_token2_adj and not is_article_indefinite):
            if is_token1_aux or is_token1_verb:
                doc = nlp_st(row_dict["text"])
                lemma = doc[i].lemma_
                # Винаги след глагола „съм“ или след глаголи, които са с преносна употреба и може да бъдат синоними на
                # глагола „съм“ (оказа се, изглежда, казвам се), се пише пълен член (-ът/-ят)
                is_lemma_aux_syn = lemma in ("е", "съм", "окажа-(се)", "изглежда")
                if is_lemma_aux_syn:
                    return True
    return False

In [None]:
def rule4(token1, token2, idx_token):
    errors = []

    # е        съученикът
    # оказа    най_верния
    # оказа    дарителя
    # изглежда най-доволния
    if not token2.endswith(("ът", "ят")):  # след глагол думата трябва да е с пълен опр. член
        errors.append(token2)

    return errors

#### 6.6.2 Test Rule4

In [None]:
conditions = [cond_rule1, cond_rule2, cond_rule3, cond_rule4]
rules = [rule1, rule2, rule3, rule4]

p, c = test_definite_article_all(test_set_1.loc[[39, 44, 46, 48]], conditions, rules)

39 ❌ Иван е съученикът ми. (Actual: 'съученикът', Expected: '')
44 ✅ Пешо се оказа най-верния му приятел. ('най-верния' is incorrect)
46 ✅ Пешо се оказа дарителя на училището. ('дарителя' is incorrect)
48 ✅ Ученикът на първия чин изглежда най-доволния от всички. ('най-доволния' is incorrect)
❌ 1/4 failed.


##### Analyze

We expected row 39 to pass but it is still failing. This is likely due to one of the existing rules. We should run tests on each rule individually to identify which one is causing the issue, specifically the one that returns the word _съученикът_.

In [None]:
p, c = test_definite_article_all(test_set_1.loc[[39]], [cond_rule1], [rule1], print_passed=False)

✅ All tests passed


In [None]:
p, c = test_definite_article_all(test_set_1.loc[[39]], [cond_rule2], [rule2], print_passed=False)

39 ❌ Иван е съученикът ми. (Actual: 'съученикът', Expected: '')
❌ 1/1 failed.


Row 39 doesn't pass due to Rule2. In Rule4 we checked whether the previous token was AUX or a VERB, so in Rule2 we need to introduce an exception that specifies the previous token cannot be a VERB or AUX. Otherwise we will have two rules acting on the same pattern.

#### 6.6.3 Update Rule2 and test again

In [None]:
# keep the old rule if we need to test with it
cond_rule2_prev = cond_rule2
rule2_prev = rule2


def cond_rule2(row_dict, i):
    """Search for either:
        - single, masculine NOUN, not subject, not preceded by a verb
        - preceding token is a verb but different from съм, оказвам се, излгежда, etc."""
    is_token1_noun = nlp_get_pos(row_dict, i) == "NOUN"
    is_token1_verb = nlp_get_pos(row_dict, i) == "VERB"
    is_token1_aux = nlp_get_pos(row_dict, i) == "AUX"
    is_token1_masc_sg = is_masculine_singular(row_dict, i)
    is_token1_subj = is_dep_subject(row_dict, i)

    is_token2_noun = nlp_get_pos(row_dict, i + 1) == "NOUN"
    is_token2_subj = is_dep_subject(row_dict, i + 1)
    is_token2_masc_sg = is_masculine_singular(row_dict, i + 1)

    # if token is masculine single noun and !subj, and the previous token is not a verb
    if i == 0 and is_token1_noun and is_token1_masc_sg and not is_token1_subj:
        return True

    is_like_verb = is_token1_verb or is_token1_aux

    if i != 0 and is_token2_noun and is_token2_masc_sg and not is_token2_subj:
        lemma = nlp_get_lemma(row_dict, i)

        is_lemma_aux_syn = lemma in ("е", "съм", "окажа-(се)", "изглежда")
        is_non_aux_syn = not is_like_verb or (is_like_verb and not is_lemma_aux_syn)
        if is_non_aux_syn:
            return True

    return False

In [None]:
def rule2(token1, token2, token_idx):
    errors = []
    word = token1 if token_idx == 0 else token2
    if word.endswith(("ът", "ят")):  # objects should not have full article
        errors.append(word)

    return errors

First test with a small set:

In [None]:
conditions = [cond_rule1, cond_rule2, cond_rule3, cond_rule4]
rules = [rule1, rule2, rule3, rule4]
p, c = test_definite_article_all(test_set_1.loc[[39, 44, 46, 48]], conditions, rules)

39 ✅ Иван е съученикът ми. (The sentence is correct)
44 ✅ Пешо се оказа най-верния му приятел. ('най-верния' is incorrect)
46 ✅ Пешо се оказа дарителя на училището. ('дарителя' is incorrect)
48 ✅ Ученикът на първия чин изглежда най-доволния от всички. ('най-доволния' is incorrect)
✅ All tests passed


Next test with the remaining rows:

In [None]:
p, c = test_definite_article_all(test_set_1.loc[idx_current], conditions, rules)

44 ✅ Пешо се оказа най-верния му приятел. ('най-верния' is incorrect)
46 ✅ Пешо се оказа дарителя на училището. ('дарителя' is incorrect)
48 ✅ Ученикът на първия чин изглежда най-доволния от всички. ('най-доволния' is incorrect)
64 ✅ Той е най-високия и хубав в стаята. ('най-високия' is incorrect)
66 ❌ Аз живея в новият бял блок. (Actual: '', Expected: 'новият')
68 ❌ Високия бял блок е нов. (Actual: '', Expected: 'Високия')
72 ❌ Ученикът, а не учителят извика при себе си директорът. (Actual: 'учителят,директорът', Expected: 'Ученикът,учителят')
73 ❌ Ученика, а не учителя извика при себе си директорът. (Actual: 'Ученика,директорът', Expected: '')
❌ 4/8 failed.


##### Analyze the remaining failures

In [None]:
inspect_spacy_doc(nlp_st(test_set_1.loc[66, "text"]))
inspect_spacy_doc(nlp_st(test_set_1.loc[68, "text"]))

Token: Аз              Tag: Ppe-os1         POS: PRON       Lemma: аз              Dep: nsubj     
Token: живея           Tag: Vpitf-r1s       POS: VERB       Lemma: живея           Dep: root      
Token: в               Tag: R               POS: ADP        Lemma: в               Dep: case      
Token: новият          Tag: Amsf            POS: ADJ        Lemma: нов             Dep: amod      
Token: бял             Tag: Amsi            POS: ADJ        Lemma: бял             Dep: amod      
Token: блок            Tag: Ncmsi           POS: NOUN       Lemma: блок            Dep: iobj      
Token: .               Tag: punct           POS: PUNCT      Lemma: .               Dep: punct     


Token: Високия         Tag: Amsh            POS: ADJ        Lemma: висок           Dep: amod      
Token: бял             Tag: Amsi            POS: ADJ        Lemma: бял             Dep: amod      
Token: блок            Tag: Ncmsi           POS: NOUN       Lemma: блок            Dep: nsubj     
Token: е

* 66 _новият бял блок_ and 68 _Високия бял блок_ - the pattern ADJ/ADJ/NOUN points to Rule5.
* 72 and 73 depend on the order of words. We don't know who is the doer of the action in those sentences. Let's postpone fixing them for now.

In [None]:
idx_failed = [72, 73]
idx_passed = sorted(set(idx_passed + p))
idx_passed = sorted(list(set(idx_passed) - set(idx_failed)))
idx_current = sorted(list(set(idx_current) - set(p)))
idx_current = sorted(list(set(idx_current) - set(idx_failed)))

#### 6.6.4 Regression Test

In [None]:
_ = test_definite_article_all(test_set_1.loc[idx_passed], conditions, rules, print_passed=False)

✅ All tests passed


### 6.7 Iteration 5

#### 6.7.1 Implement Rule5

Rule5 says that when there are two or more adjectives in front of the noun, only the first adjective takes the definite article, which could be short or long. In the example with rows 66 and 68 which were analyzed in the previous section, we see that the two phrases _новият бял блок_ and _високия бял блок_ which are identical from grammatical standpoint, cannot be distinguished just by looking at their part of speech. We need to also analyze their relation to the entire sentence. The difference here is that in the first case Stanza correctly determined the noun _блок_ as an indirect object, therefore requiring the short form of the definite article, and the same word in the second sentence as nominal subject, requiring full definite article.

<img src="images/rule7_pattern_new.png" width="650">

Although we expressed the condition verbally as one rule, we need to create two sets of functions for the cases of short and long definite article.

In [None]:
# The adjectives, as well as the pronouns, the ordinal numerals etc., used as attributes in the sentence
# are usually placed in front of the nouns they qualify. In this case, the definite article, if needed,
# is joined to the _first_ attribute of the noun phrase. [4],[5]
def cond_rule5_helper(row_dict, i):
    """
    POS: ADJ / ADJ / NOUN iobj,  all masc. singular -> first ADJ must have def. art. short form
    - or -
    POS: ADJ / ADJ / NOUN nsubj, all masc. singular -> first ADJ must have def. art. long form
    """
    if i >= row_dict["num_tokens"] - 2:
        return False

    is_token1_masc_sg = is_masculine_singular(row_dict, i)
    is_token1_adj = nlp_get_pos(row_dict, i) == "ADJ"

    is_token2_masc_sg = is_masculine_singular(row_dict, i + 1)
    is_token2_adj = nlp_get_pos(row_dict, i + 1) == "ADJ"

    is_token3_masc_sg = is_masculine_singular(row_dict, i + 2)
    is_token3_noun = nlp_get_pos(row_dict, i + 2) == "NOUN"

    if is_token1_masc_sg and is_token2_masc_sg and is_token3_masc_sg:
        if is_token1_adj and is_token2_adj and is_token3_noun:
            return True

    return False

In [None]:
def cond_rule5_short(row_dict, i):
    if cond_rule5_helper(row_dict, i):
        noun_dep = nlp_get_dep(row_dict, i + 2)
        if noun_dep in ("iobj"):
            return True

    return False

In [None]:
def cond_rule5_long(row_dict, i):
    if cond_rule5_helper(row_dict, i):
        noun_dep = nlp_get_dep(row_dict, i + 2)
        if noun_dep in ("nsubj"):
            return True

    return False

In [None]:
def rule5_short(token1, token2, token_idx):
    errors = []

    # Аз живея в новият бял блок.
    if not token1.endswith(("я")):  # първото прилагателно в групата трябва да е с непълен член
        errors.append(token1)

    return errors

In [None]:
def rule5_long(token1, token2, token_idx):
    errors = []

    # Високия бял блок е нов.
    if not token1.endswith(("ят")):  # първото прилагателно в групата трябва да е с пълен член
        errors.append(token1)

    return errors

#### 6.7.2 Test Rule5

In [None]:
conditions = [cond_rule1, cond_rule2, cond_rule3, cond_rule4, cond_rule5_short, cond_rule5_long]
rules = [rule1, rule2, rule3, rule4, rule5_short, rule5_long]
p, c = test_definite_article_all(test_set_1.loc[idx_current], conditions, rules)

66 ✅ Аз живея в новият бял блок. ('новият' is incorrect)
68 ✅ Високия бял блок е нов. ('Високия' is incorrect)
✅ All tests passed


#### 6.7.3 Regression test

In [None]:
_ = test_definite_article_all(test_set_1.loc[idx_passed], conditions, rules, print_passed=False)

✅ All tests passed


#### 6.7.4 Final test with the whole set

In [None]:
_ = test_definite_article_all(test_set_1, conditions, rules)

0 ✅ Ключа е на масата. ('Ключа' is incorrect)
1 ✅ Ключът е на масата. (The sentence is correct)
2 ✅ Царя пие вино. ('Царя' is incorrect)
3 ✅ Царят пие вино. (The sentence is correct)
4 ✅ Ученика е умен и трудолюбив. ('Ученика' is incorrect)
5 ✅ Ученикът е умен и трудолюбив. (The sentence is correct)
6 ✅ Приятеля ми е в чужбина. ('Приятеля' is incorrect)
7 ✅ Приятелят ми е в чужбина. (The sentence is correct)
8 ✅ Госта пристигна. ('Госта' is incorrect)
9 ✅ Гостът пристигна. (The sentence is correct)
10 ❌ Готвача приготви обяда. (Actual: '', Expected: 'Готвача')
11 ✅ Готвачът приготви обяда. (The sentence is correct)
12 ✅ Влака спря на гара София. ('Влака' is incorrect)
13 ✅ Влакът спря на гара София. (The sentence is correct)
14 ✅ Учителя говореше за Стефан Стамболов. ('Учителя' is incorrect)
15 ✅ Учителят говореше за Стефан Стамболов. (The sentence is correct)
16 ❌ Таня търси лекарят. (Actual: '', Expected: 'лекарят')
17 ✅ Таня търси лекаря. (The sentence is correct)
18 ✅ Говорим за ле

We didn't predict correctly the usage of the definite article in 13 out of the 78 sentences. In some cases it is due to the NLP tag being incorrect, for example the case where the gender of the noun was tagged wrongly by Stanza. In other case it was because we still haven't implemented Rule7.

## 7 Test with real data

We've initially evaluated our rules using brief, uncomplicated sentences. These were intentionally kept short to clearly demonstrate each specific rule. However, real-world language usage typically involves lengthier and more intricate sentences. It's now necessary to apply our rules to authentic texts to assess their effectiveness.

We will work with the dataset [bulgarian-grammar-mistakes](https://huggingface.co/datasets/thebogko/bulgarian-grammar-mistakes) from huggingface. The data was originally collected from articles from Bulgarian Wikipedia as well as rows from OSCAR's Bulgarian datasets.

### 7.1 Load and prepare the dataset

In [None]:
grammar_errors = pd.read_csv("data/grammar_errors_original.csv")

In [None]:
grammar_errors.shape

(7587, 3)

In [None]:
grammar_errors.head(5)

Unnamed: 0,error_type,erroneous,correct
0,article_misuse,От какво беше направен входа на двора на скини...,От какво беше направен входът на двора на скин...
1,article_misuse,"Танева е предупредила, че документа ще се отра...","Танева е предупредила, че документът ще се отр..."
2,article_misuse,Патогенетичният механизъм на развитието на хип...,Патогенетичният механизъм на развитието на хип...
3,article_misuse,Президента и БСП осъдиха екстремизма и езика н...,Президентът и БСП осъдиха екстремизма и езика ...
4,article_misuse,Под дарението стоят името и подписа на просвет...,Под дарението стоят името и подписът на просве...


#### 7.1.1 Filter only errors related to article misuse:

In [None]:
grammar_errors.error_type.unique()

array(['article_misuse', 'pronoun_misuse', 'incorrect_verb_suffix_me',
       'noun_adjective_disagreement'], dtype=object)

In [None]:
grammar_errors = grammar_errors[grammar_errors["error_type"] == "article_misuse"]

In [None]:
grammar_errors.shape

(2349, 3)

#### 7.1.2 Rename column headings

In [None]:
grammar_errors = grammar_errors.rename(columns={"erroneous": "incorrect"})

#### 7.1.3 Add a column with differing words

In [None]:
grammar_errors[['incorrect', 'correct']] = grammar_errors[['incorrect', 'correct']].map(str.strip)
grammar_errors[["correct_words", "incorrect_words"]] = grammar_errors.apply(get_differing_words, axis=1, result_type="expand")

In [None]:
grammar_errors.head(3)

Unnamed: 0,error_type,incorrect,correct,correct_words,incorrect_words
0,article_misuse,От какво беше направен входа на двора на скини...,От какво беше направен входът на двора на скин...,входа,входът
1,article_misuse,"Танева е предупредила, че документа ще се отра...","Танева е предупредила, че документът ще се отр...",документа,документът
2,article_misuse,Патогенетичният механизъм на развитието на хип...,Патогенетичният механизъм на развитието на хип...,излишъка,излишъкът


#### 7.1.4 Melt the dataset and sort pairs of correct/incorrect sentences to be together

In [None]:
grammar_errors.shape

(2349, 5)

In [None]:
grammar_errors = melt_and_sort_sentences(grammar_errors)

In [None]:
grammar_errors.shape

(4698, 4)

In [None]:
grammar_errors.head(4)

Unnamed: 0,text,is_correct,incorrect_words,correct_words
0,От какво беше направен входа на двора на скини...,False,входа,входът
1,От какво беше направен входът на двора на скин...,True,,
2,"Танева е предупредила, че документа ще се отра...",False,документа,документът
3,"Танева е предупредила, че документът ще се отр...",True,,


#### 7.1.5 Add NLP tags

<div style="background-color:bisque">⚠️Note that calculating the features using Stanza takes around 1 hour on a laptop with average specs, therefore here we are loading a pre-saved file.

In [None]:
if False:
    feature_columns_st = ["pos", "tag", "dep", "morph", "lemmas", "left_edge", "right_edge", "num_tokens", "n_sents"]
    grammar_errors[feature_columns_st] = grammar_errors.apply(lambda r: extract_features(nlp_st, r, "text"), axis=1, result_type="expand")
    grammar_errors.to_csv("data/grammar_errors_with_nlp.csv", index=None)
else:
    # load from file instead of the above, it is faster
    grammar_errors = pd.read_csv("data/grammar_errors_with_nlp.csv")
    grammar_errors["correct_words"] = grammar_errors["correct_words"].fillna("")
    grammar_errors["incorrect_words"] = grammar_errors["incorrect_words"].fillna("")

#### 7.1.6 Filter only rows with 1 sentence, no quotes and other special characters

We need to filter out such texts since our rules can't deal with quoted text inside a sentence. Special characters also cause issues.

In [None]:
grammar_errors = grammar_errors[grammar_errors["n_sents"] == 1]

In [None]:
pattern = r"[\'\"‘’“”°/≈\:]"
grammar_errors = grammar_errors[~grammar_errors['text'].str.contains(pattern)]

In [None]:
grammar_errors.shape

(3605, 13)

### 7.2 Test with small subset of the real data

In [None]:
grammar_errors_10tokens = grammar_errors[grammar_errors["num_tokens"] < 10]

In [None]:
# execution will take 30 sec
passed, failed = test_definite_article_all(grammar_errors_10tokens, conditions, rules, print_passed=False, print_failed=True)

10 ❌ Сайта Уча. се е в тяхна помощ. (Actual: '', Expected: 'Сайта')
32 ❌ Бих искал и аз да дам своя принос. (Actual: '', Expected: 'своя')
50 ❌ Заема има предназначение за обзавеждане на дома. (Actual: '', Expected: 'Заема')
258 ❌ Рейс контрола пусна кола за сигурност. (Actual: 'Рейс', Expected: 'контрола')
259 ❌ Рейс контролът пусна кола за сигурност. (Actual: 'Рейс', Expected: '')
332 ❌ Сайта е разработен от Медия груп 24 ООД. (Actual: '', Expected: 'Сайта')
338 ❌ Да, това е Учителят. (Actual: 'Учителят', Expected: '')
339 ❌ Да, това е Учителят. (Actual: 'Учителят', Expected: '')
366 ❌ Триумвирата е мъртъв. (Actual: '', Expected: 'Триумвирата')
416 ❌ Пика на есен 2010 ме хвана в София. (Actual: '', Expected: 'Пика')
454 ❌ По какво се различава домата от трактора? (Actual: '', Expected: 'домата')
458 ❌ Сайта е написан с Notepad. (Actual: '', Expected: 'Сайта')
672 ❌ Нужен ли е сапуна за съществуването на човека? (Actual: '', Expected: 'сапуна')
702 ❌ Името на философа носи кратера Пла

Let's review the outcomes.

**To begin with**, the rules failed to detect errors in nearly a quarter (24%) of the tests.
However, upon closer inspection of the failures, a pattern emerges. Upon closer look, it is evident that most of the failed tests have even-numbered indices. This suggests that while correctly formed sentences were accurately identified, not all erroneous sentences were successfully detected.

Let's take a closer look at the instances where the tests failed for sentences that were initially correct:

In [None]:
grammar_errors.loc[[338, 339, 2022, 2023, 3396, 3397, 3942, 3943 ]][['text', 'is_correct', 'incorrect_words', 'correct_words']]

Unnamed: 0,text,is_correct,incorrect_words,correct_words
338,"Да, това е Учителят.",False,,
339,"Да, това е Учителят.",True,,
2022,Администраторът на форума така е решил.,False,Администраторът,Администратора
2023,Администратора на форума така е решил.,True,,
3396,Официалният език на форума е българският.,False,Официалният,Официалния
3397,Официалния език на форума е българският.,True,,
3942,Това е успокоителният хап за съвестта му ...,False,успокоителният,успокоителния
3943,Това е успокоителния хап за съвестта му ...,True,,


The sentence _Да, това е Учителят_ is listed as both correct and incorrect.

The sentence _Администратора на форума така е решил_ is listed as correct, but in fact it is not.

The sentence _Това е успокоителния хап за съвестта му_ is also listed as correct, but in fact it is incorrect.

Therefore, in at least these 3 examples, the rules correctly identified the error, but the expected result was wrong bcause the original data in the dataset was wrong.

The assumption about the original wrong data is, that the data was scraped from various sources and considered "correct", then errors were automatically introduced to produce the incorrect version. However, it's important to note that the source material itself may not always be grammatically accurate. For exmaple, the sentence _Това е успокоителния хап за съвестта му_ can be found [in a blog comment from 2010](https://petdoshkov.blog.bg/drugi/2010/05/10/vyzzivnoto-reshenie-za-nakazanieto-zabelejka.542411). This example demonstrates that even the original text contained grammatical errors, challenging the assumption that the initial data was entirely correct before deliberate mistakes were added.




**Another reason** of the many failures is that we still haven't implemented all rules listed in the Grammas section. Additionally, we tested on a small set of 39 pairs of simple sentences so we didn't verify our "model" with enough data. Another explanation is that in the NLP features we extracted there are two still unused ones - left edge and right edge.

### 7.3 Test with the whole dataset

In [None]:
grammar_errors.shape

(3605, 13)

<div style="background-color:Bisque">

⚠️The below test will take ~ 18 minutes. As an alternative to executing it you may take a look at the screenshot.

In [None]:
if False:
    start_time = time.time()

    passed, failed = test_definite_article_all(grammar_errors, conditions, rules, print_passed=False, print_failed=False)

    end_time = time.time()
    execution_time_st = end_time - start_time
    print(f"Execution time (stanza): {execution_time_st} seconds")

Screenshot of the result:

<img src="images/whole_dataset_result.png">

Testing with the whole dataset we see that one third of the tests failed. Certainly there is room for improvement!

## Summary and insights gained

Although the results of the final test were not as encouraging as hoped, the project still yielded valuable insights.

Initially, selecting the appropriate NLP library was a challenging task. While UDPipe offered good performance, it lacked accuracy, and although Stanza delivered better results, it was much slower. This experience highlighted the importance of carefully balancing accuracy and performance when choosing tools for linguistic analysis.

Another key realization was the unexpected complexity of Bulgarian grammar, particularly the rules surrounding the use of the definite article. This complexity necessitated a deeper investigation into the language's linguistic structures.

Parsing text to detect incorrect definite articles also proved to be more complicated than expected. The development of effective rules for this task was hindered by the diversity of sentence structures encountered.

Additionally, the testing dataset posed its own challenges, as it contained inaccuracies that affected the validation process. This emphasized the crucial need for high-quality, accurate datasets.

Despite these obstacles, the implementation of rule-based methods produced promising results, especially in identifying errors in shorter sentences. This success demonstrated that even in the face of linguistic complexity, well-designed rules are essential for achieving accurate outcomes.

The insights gained from this effort will undoubtedly contribute to the broader field of computational linguistics and inspire more accurate and efficient solutions for grammatical analysis in Bulgarian. As we continue to refine our approach and expand our rule set, we are confident in our ability to develop a robust and reliable tool for checking the correctness of definite articles in Bulgarian texts.

**References**

<div id="ref1">[1] Astoria Academy, <a href="https://astoria-academy.com/the-definite-articles-of-bulgarian/">The Definite articles of Bulgarian,</a> 2023</div>

<div id="ref2">[2] CoLanguage <a href="https://www.colanguage.com/definite-article-bulgarian-nouns">Definite article of the Bulgarian nouns</a></div>

<div id="ref3">[3] в. „Аз Буки“ бр. 16 <a href="https://ibl.bas.bg/ezikovi_spravki/otnovo-za-palniya-i-kratkiya-tchlen/">Отново за пълния и краткия член</a></div>

<div id="ref4">[4] Raquel Jacob <a href="https://help.unbabel.com/hc/en-us/articles/360022878854-Language-Guidelines-Bulgarian">Language Guidelines – Bulgarian</a></div>

<div id="ref5">[5] Andonova, Sabeva, Zagorova <a href="https://caritas.bg/cms/wp-content/uploads/2015/04/A1-English.pdf?x10535">Bulgarian for Refugees,</a> 2014</div>

<div id="ref6">[6] John Leafgren <a href="http://www.seelrc.org:8080/grammar/pdf/stand_alone_bulgarian.pdf">A Concise Bulgarian
Grammar</a></div>

<div id="ref7">[7] G. Popova <a href="https://www.english-linguistics.de/archives/clark/SIMOV/CM/popova.pdf">Towards an HPSG Account of the
Bulgarian Definite Article</a></div>

<div id="ref8">[8] K. Bontcheva <a href="https://theswissbay.ch/pdf/Books/Linguistics/Mega%20linguistics%20pack/Indo-European/Balto-Slavic/Bulgarian%20Grammar%2C%20Elementary%20%28Bontcheva%29.pdf">Bulgarian Language - Grammar</a></div>