In [70]:
!ls -al

total 14048
drwxr-xr-x 1 root root     4096 Jan 19 07:22 .
drwxr-xr-x 1 root root     4096 Jan 19 07:17 ..
-rw-r--r-- 1 root root 14360705 Jan 19 08:00 bulgarian-btb-ud-2.5-191206.udpipe
drwxr-xr-x 4 root root     4096 Jan 16 14:29 .config
drwxr-xr-x 1 root root     4096 Jan 16 14:29 sample_data
drwxr-xr-x 3 root root     4096 Jan 19 07:22 stanza_resources


In [71]:
!pip install ufal.udpipe



In [72]:
from spacy.tokens import Doc
from ufal.udpipe import Model, Pipeline

import os
import pandas as pd
import re
import spacy
import spacy_stanza
import stanza
import time
import urllib.request
import warnings

In [73]:
# Suppress specific warnings
warnings.filterwarnings("ignore", category=FutureWarning, message=".*torch.load.*")

In [74]:
USE_STANZA=False

# Linguistic Rule-Based System for Definite Article Verification in Bulgarian Language

## 3 Experiments with Stanza and UDPipe

### 3.1 Download and initialize Bulgarian models

In [75]:
# Download the UDPipe Bulgarian model
model_url_udpipe = "https://lindat.mff.cuni.cz/repository/xmlui/bitstream/handle/11234/1-3131/bulgarian-btb-ud-2.5-191206.udpipe?sequence=6&isAllowed=y"
model_filename_udpipe = "bulgarian-btb-ud-2.5-191206.udpipe"
urllib.request.urlretrieve(model_url_udpipe, model_filename_udpipe)

('bulgarian-btb-ud-2.5-191206.udpipe',
 <http.client.HTTPMessage at 0x7d29716a7f50>)

In [76]:
# Load the UDPipe nlp pipeline
model_udpipe = Model.load(model_filename_udpipe)
pipeline_udpipe = Pipeline(model_udpipe, "tokenize", Pipeline.DEFAULT, Pipeline.DEFAULT, "conllu")
nlp_spacy_udpipe = spacy.blank("bg")



### 3.2 Define initial functions

We need some functions for creating the NLP objects for Stanza and UDPipe.

In [77]:
def extract_gender_from_tag(tag):
    gender = " "
    # gender is applicable to common noun, proper noun or adjective
    if tag.startswith(("Nc", "Np", "A")):
        if "m" in tag:
            gender = "Masculine"
        elif "f" in tag:
            gender = "Feminine"
        elif "n" in tag:
            gender = "Neutral"
    return gender

# TODO: need to exclude triple character patterns like "p1s": "Past tense, 1st person sng",
def extract_number_from_tag(tag):
    number = None
    if "s" in tag:
        number = "Singular"
    elif "p" in tag:
        number = "Plural"
    return number

def is_masculine(row_dict, i):
    tag = row_dict["tag"].split(",")[i]
    return extract_gender_from_tag(tag) == "Masculine"

def is_singular(row_dict, i):
    tag = row_dict["tag"].split(",")[i]
    return extract_number_from_tag(tag) == "Singular"

def is_masculine_singular(row_dict, i):
    return is_masculine(row_dict, i) and is_singular(row_dict, i)

def is_dep_subject(row_dict, i):
    dep = row_dict["dep"].split(",")[i]
    return dep in {"nsubj", "csubj", "nsubj:pass", "csubj:pass"}

def nlp_get_pos(row_dict, i):
    return row_dict["pos"].split(",")[i]

def nlp_get_dep(row_dict, i):
    return row_dict["dep"].split(",")[i]

def nlp_get_lemma(row_dict, i):
    return row_dict["lemmas"].split(",")[i]

def nlp_get_article(row_dict, i):
    morph = row_dict["morph"].split(",")[i]
    if "Definite=Def" in morph:
        return "definite"
    elif "Definite=Ind" in morph:
        return "indefinite"
    else:
        return ""

In [78]:
# for quick viewing the features
# use updpipe object for the extraction of the word because of performance
# use stanza object for determining features because of accuracy
def inspect_word_prev(row_dict, word_num):
    word = nlp_ud(row_dict["text"])[word_num].text
    pos = row_dict["pos"].split(",")[word_num]
    tag = row_dict["tag"].split(",")[word_num]
    dep = row_dict["dep"].split(",")[word_num]

    gen_s = extract_gender_from_tag(tag)[0:1]  # M - Masculine, F - Feminine, N - Neutral
    gen_s = extract_gender_from_tag(tag)[0:1] if extract_gender_from_tag(tag) else " "
    # num_s = extract_number_from_tag(tag)[0:1]
    num_s = extract_number_from_tag(tag)[0:1] if extract_number_from_tag(tag) else ""

    # return f"{row_dict['index']} {word:<15} POS:{pos:<{10}} Gen:{gen_s}    Num:{num_s}   DEP:{dep:<{10}} Sent: {row_dict["text"]:<30}"
    return f"{row_dict['index']} {word:<15} POS:{pos:<{10}} Gen:{gen_s}    Num:{num_s}   DEP:{dep:<{10}} Sent: {row_dict['text'][:30]}"

In [79]:
# for quick viewing the features
# use updpipe object for the extraction of the word because of performance
# use stanza object for determining features because of accuracy
def inspect_word(row_dict, word_num):
    word = nlp_ud(row_dict["text"])[word_num].text
    pos = row_dict["pos"].split(",")[word_num]
    tag = row_dict["tag"].split(",")[word_num]
    dep = row_dict["dep"].split(",")[word_num]

    gen_s = extract_gender_from_tag(tag)[0:1]  # M - Masculine, F - Feminine, N - Neutral
    gen_s = extract_gender_from_tag(tag)[0:1] if extract_gender_from_tag(tag) else " "
    # num_s = extract_number_from_tag(tag)[0:1]
    num_s = extract_number_from_tag(tag)[0:1] if extract_number_from_tag(tag) else ""

    # return f"{row_dict['index']} {word:<15} POS:{pos:<{10}} Gen:{gen_s}    Num:{num_s}   DEP:{dep:<{10}} Sent: {row_dict["text"]:<30}"
    return f"{row_dict['index']} {word:<15} POS:{pos:<{10}} Gen:{gen_s}    Num:{num_s}   DEP:{dep:<{10}} Sent: {row_dict['text'][:30]}"

In [80]:
def row_to_dict(df, index):
    row_dict = df.loc[index].to_dict()
    row_dict["index"] = index
    return row_dict

In [81]:
def udpipe_to_spacy(text, nlp):
    """Parses a CoNLL-U formatted string into a spaCy Doc object."""
    doc_str = pipeline_udpipe.process(text)
    lines = doc_str.strip().splitlines()
    words = []
    lemmas = []
    spaces = []
    pos_tags = []
    morph_tags = []
    dep_heads = []
    dep_rels = []

    for line in lines:
        if line.startswith("#") or not line.strip():
            continue

        parts = line.split("\t")
        index, word, lemma, pos, tag, feats, head, dep_rel, _, misc = parts

        words.append(word)
        lemmas.append(lemma)
        pos_tags.append(pos)  # POS tag (simpler POS category)
        morph_tags.append(tag)  # Detailed morphological tag
        dep_heads.append(int(head) - 1)  # Convert to zero-indexed
        dep_rels.append(dep_rel)

        if "SpaceAfter=No" in misc:
            spaces.append(False)
        else:
            spaces.append(True)

    # Create the Doc object
    doc = Doc(nlp.vocab, words=words, spaces=spaces)

    # Set the POS tags, morphological tags, lemmas, and dependency parsing information
    for token, lemma, pos, morph_tag, head, dep_rel in zip(doc, lemmas, pos_tags, morph_tags, dep_heads, dep_rels):
        token.lemma_ = lemma
        token.pos_ = pos  # Simple POS tag
        token.tag_ = morph_tag  # Detailed morphological tag
        token.head = doc[head] if head >= 0 else token
        token.dep_ = dep_rel

    return doc

In [82]:
def nlp_ud(txt):
    return udpipe_to_spacy(txt, nlp_spacy_udpipe)

### 3.3 Compare NLP features

In [83]:
# A helper function for displaying useful NLP features in easy to read format
def inspect_spacy_doc(doc):
    for token in doc:
        print(f"Token: {token.text:<15} Tag: {token.tag_:<15} POS: {token.pos_:<10} Lemma: {token.lemma_:<15} Dep: {token.dep_:<10}")
    print("\n")

In [84]:
sentence = """Сред гостите на официалната церемония по встъпване в длъжност на новоизбрания президент ще бъдат българският \
държавен глава Румен Радев, президентът на Албания Илир Мета и на Косово Хашим Тачи."""
doc_ud = nlp_ud(sentence)

print("\nNLP features from UDPipe:")
inspect_spacy_doc(doc_ud)


NLP features from UDPipe:
Token: Сред            Tag: R               POS: ADP        Lemma: сред            Dep: case      
Token: гостите         Tag: Ncmpd           POS: NOUN       Lemma: гост            Dep: nmod      
Token: на              Tag: R               POS: ADP        Lemma: на              Dep: case      
Token: официалната     Tag: Afsd            POS: ADJ        Lemma: официален       Dep: amod      
Token: церемония       Tag: Ncfsi           POS: NOUN       Lemma: церемония       Dep: nmod      
Token: по              Tag: R               POS: ADP        Lemma: по              Dep: case      
Token: встъпване       Tag: Ncnsi           POS: NOUN       Lemma: встъпване       Dep: nmod      
Token: в               Tag: R               POS: ADP        Lemma: в               Dep: case      
Token: длъжност        Tag: Ncfsi           POS: NOUN       Lemma: длъжност        Dep: nmod      
Token: на              Tag: R               POS: ADP        Lemma: на             

### 3.5 Compare performance with large amount of data

In [87]:
# Create a DataFrame with 10 rows, use same sentence for simplicity
test_df = pd.DataFrame({"text": [sentence] * 10})

In [95]:
def sent_features_to_string(sent):
    """Converts sentence features to strings"""
    pos = ",".join([tk.pos_ for tk in sent])
    tag = ",".join([tk.tag_ for tk in sent])
    dep = ",".join([tk.dep_ for tk in sent])
    morph = ",".join([str(tk.morph) for tk in sent])
    lemmas = ",".join([str(tk.lemma_) for tk in sent])
    n_tokens = len(sent)

    return pos, tag, dep, morph, lemmas,n_tokens

def udpipe_sentence_to_features(nlp, row, col):
    """Extracts linguistic features from a text column using a given NLP model"""
    value = row[col]
    doc = nlp(value)
    return sent_features_to_string(doc)

#### 3.5.2 Extracting features using UDPipe

In [96]:
test_df[["pos", "tag", "dep", "morph", "lemmas", "num_words"]] = test_df.apply(
    lambda r: udpipe_sentence_to_features(nlp_ud, r, "text"), axis=1, result_type="expand"
)

#### 3.5.3 Comparison of the performance

In [90]:
test_df.columns

Index(['text', 'pos', 'tag', 'dep', 'morph', 'lemmas', 'num_words'], dtype='object')

In [91]:
test_df.loc[0, 'lemmas']

'сред,гост,на,официален,церемония,по,встъпване,в,длъжност,на,новоизбрам,президент,ще,бъда,български,държавен,глава,румен,радев,,,президент,на,албания,Илир,Мета,и,на,косово,Хашим,Тача,.'

In [94]:
assert test_df.loc[0, 'lemmas'] == 'сред,гост,на,официален,церемония,по,встъпване,в,длъжност,на,новоизбрам,президент,ще,бъда,български,държавен,глава,румен,радев,,,президент,на,албания,Илир,Мета,и,на,косово,Хашим,Тача,.'

In [68]:
test_df.head(2)

Unnamed: 0,text,pos,tag,dep,morph,lemmas,num_words
0,Сред гостите на официалната церемония по встъп...,"ADP,NOUN,ADP,ADJ,NOUN,ADP,NOUN,ADP,NOUN,ADP,AD...","R,Ncmpd,R,Afsd,Ncfsi,R,Ncnsi,R,Ncfsi,R,Amsh,Nc...","case,nmod,case,amod,nmod,case,nmod,case,nmod,c...",",,,,,,,,,,,,,,,,,,,,,,,,,,,,,,","сред,гост,на,официален,церемония,по,встъпване,...",31
1,Сред гостите на официалната церемония по встъп...,"ADP,NOUN,ADP,ADJ,NOUN,ADP,NOUN,ADP,NOUN,ADP,AD...","R,Ncmpd,R,Afsd,Ncfsi,R,Ncnsi,R,Ncfsi,R,Amsh,Nc...","case,nmod,case,amod,nmod,case,nmod,case,nmod,c...",",,,,,,,,,,,,,,,,,,,,,,,,,,,,,,","сред,гост,на,официален,церемония,по,встъпване,...",31


In [69]:
res = []
res.append(inspect_word(row_to_dict(test_df, 1), 0))
res.append(inspect_word(row_to_dict(test_df, 1), 1))
res.append(inspect_word(row_to_dict(test_df, 2), 2))
res.append(inspect_word(row_to_dict(test_df, 2), 3))
res.append(inspect_word(row_to_dict(test_df, 2), 4))
print("\n".join(res))


1 Сред            POS:ADP        Gen:     Num:   DEP:case       Sent: Сред гостите на официалната це
1 гостите         POS:NOUN       Gen:M    Num:P   DEP:nmod       Sent: Сред гостите на официалната це
2 на              POS:ADP        Gen:     Num:   DEP:case       Sent: Сред гостите на официалната це
2 официалната     POS:ADJ        Gen:F    Num:S   DEP:amod       Sent: Сред гостите на официалната це
2 церемония       POS:NOUN       Gen:F    Num:S   DEP:nmod       Sent: Сред гостите на официалната це


In [62]:
DO TUK

SyntaxError: invalid syntax (<ipython-input-62-7d37760df8dc>, line 1)

The output indicates that Stanza took much longer to execute compared to UDPipe for the same task of extracting NLP features from 10 sentences.

#### 3.5.3 Final verdict

Stanza in much better at splitting into sentences and supports more features, whereas UDPipe has significantly better performance. So the next steps will be performed mainly with Stanza in order to get more accurate results but let's be open to the possibility of using UDPipe in certain situations.

## 4 Proof of concept

Let's create a minimal program as a proof of concept. The goal is to illustrate the idea behind text processing and determining the correctness of the definite article.

### 4.1 Define one rule

Define one simple rule and put it in a testing function. The rule is: if the word is a noun, is preceded by a preposition, and ends in _ът_ or _ят_, then the long form of the definite article is incorrect. Note that in reality this rule is not enough to determine the correct usage of the definite article but for the purposes of the POC it will suffice.

In [None]:
def verify_definite_article(doc):
    """Detects incorrect usage of the definite article based on POS tags and word endings"""
    errors = []

    for i, token in enumerate(doc):
        # Check if the token is a noun with a definite article (full form)
        if token.pos_ == "NOUN" and token.text.endswith(("ът", "ят")):
            # Check if the preceding token is a preposition (ADP), like "в"
            if i > 0 and doc[i - 1].pos_ == "ADP":
                errors.append(f"Incorrect usage of full definite article: '{token.text}' in sentence: '{doc.text}'")
    return errors

### 4.2 Test and validate

In [None]:
valid_sentences = ["Аз отивам в офиса.", "Иван отива в офиса.", "Офисът е в София."]
invalid_sentences = ["Аз отивам в офисът.", "Иван отива в офисът.", "Офиса е в София."]

In [None]:
# Process each sentence with both libraries
results_st = []
results_ud = []
for sentence in valid_sentences + invalid_sentences:
    doc_st = nlp_st(sentence)
    doc_ud = nlp_ud(sentence)

    results_st.append(verify_definite_article(doc_st))
    results_ud.append(verify_definite_article(doc_ud))

In [None]:
print("Results from Stanza:")
for i, res in enumerate(results_st):
    print(f"{i+1}:", res)

print("\nResults from UDPipe:")
for i, res in enumerate(results_ud):
    print(f"{i+1}:", res)

Of course we can't expect much correctness of this simple rule but we can see that it did find some incorrect usages in two out of the six cases. Not bad, actually!

## 5 Load and clean the testing data

This short set of sentences contains some common misuses of the definite article and also the correct usage. We will use this set for initial testing of the rules we develop.

In [None]:
test_set_1 = pd.read_csv("https://raw.githubusercontent.com/MirkaIvanova/datasets/refs/heads/main/data_science_project/test_set_1.csv")

In [None]:
test_set_1.head()

### 5.1 Trim the sentences of whitespaces

In [None]:
test_set_1[['incorrect', 'correct']] = test_set_1[['incorrect', 'correct']].map(str.strip)

### 5.2 Add column with the differing words

In order to perform automated tests we will need to know were exactly is the error in the sentence, if there is an error. Therefore, we'll add a new column with the expected incorrect words. Before that we need to do some cleaning like removal of punctuation. Note that this function is designed to work with Bulgarian in order to keep words 🟠

In [None]:
def clean_sentence_bg(sentence):
    """Cleans a Bulgarian sentence by removing unwanted punctuation but preserving valid dashes."""
    # Remove dashes not surrounded by exactly two bg letters
    # keep also numbers in order to preserve words like 5-годишен
    sentence = re.sub(r"(?<![0-9а-яА-Я])-|-(?![а-яА-Я])", "", sentence)

    # Remove all other punctuation except for valid dashes
    sentence = re.sub(r"[^\w\s-]", "", sentence)

    return sentence

In [None]:
def get_differing_words(row):
    """Extracts differing words between two sentences and logs word count discrepancies."""
    # Clean and split the sentences into words
    incorrect_words = clean_sentence_bg(row["incorrect"]).split()
    correct_words = clean_sentence_bg(row["correct"]).split()

    # Log if word count differs
    if len(incorrect_words) != len(correct_words):
        print(f"Word count differs: Incorrect - {len(incorrect_words)} words, Correct - {len(correct_words)} words")

    # Get words from the correct sentence that are not in the incorrect sentence
    differing_correct_words = [word for word in correct_words if word not in incorrect_words]
    differing_incorrect_words = [word for word in incorrect_words if word not in correct_words]

    return ",".join(differing_incorrect_words), ",".join(differing_correct_words)

In [None]:
# add a new column to hold the correct and incorrect words, they will be used for automated testing
test_set_1[["correct_words", "incorrect_words"]] = test_set_1.apply(get_differing_words, axis=1, result_type="expand")

In [None]:
test_set_1.head(3)

### 5.3 Melt and sort

Merge the columns _incorrect_ and _correct_ to a single column _text_. Keep each pair of incorrect and correct sentences together.

In [None]:
def melt_and_sort_sentences(df):
    """Transforms a DataFrame to melt sentence pairs, ensuring incorrect and correct sentences are paired."""

    # Swap 'incorrect_words' and 'correct_words' columns
    df = df.rename(columns={"incorrect_words": "temp_correct_words", "correct_words": "incorrect_words"})
    df = df.rename(columns={"temp_correct_words": "correct_words"})

    # Add an order column to keep track of the original order
    df["order"] = df.index

    # Melt the DataFrame
    df_melted = pd.melt(
        df,
        id_vars=["correct_words", "incorrect_words", "order"],
        value_vars=["incorrect", "correct"],
        var_name="is_correct",
        value_name="text",
    )

    # Set 'is_correct' to True for 'correct' and False for 'incorrect'
    df_melted["is_correct"] = df_melted["is_correct"].apply(lambda x: x == "correct")

    # Assign incorrect_words and correct_words only to incorrect sentences
    df_melted["incorrect_words"] = df_melted.apply(lambda row: row["incorrect_words"] if not row["is_correct"] else "", axis=1)
    df_melted["correct_words"] = df_melted.apply(lambda row: row["correct_words"] if not row["is_correct"] else "", axis=1)

    # Sort by 'order' to ensure the pairs of correct/incorrect sentences are next to each other
    df_melted = df_melted.sort_values(by=["order", "is_correct"], ascending=[True, True])

    # Reset index for clean output
    df_melted.reset_index(drop=True, inplace=True)

    # Drop the 'order' column as it's no longer needed
    df_melted = df_melted.drop(columns=["order"])

    # Reorder columns to match the desired output
    df_melted = df_melted[["text", "is_correct", "incorrect_words", "correct_words"]]

    return df_melted

In [None]:
test_set_1 = melt_and_sort_sentences(test_set_1)

In [None]:
test_set_1.head(6)

### 5.4 Add columns with Stanza NLP features

In [None]:
if True:
    # extract stanza features as new columns
    feature_columns_st = ["pos", "tag", "dep", "morph", "lemmas", "left_edge", "right_edge", "num_tokens"]
    test_set_1[feature_columns_st] = test_set_1.apply(lambda r: extract_features(nlp_st, r, "text"), axis=1, result_type="expand")

    # save in case we want to load it faster
    if not os.path.exists("data"):
        os.makedirs("data")
    test_set_1.to_csv("data/test_set_1_clean.csv", index=None)
else:
    # load from file instead of the above, it is faster
    test_set_1 = pd.read_csv("data/test_set_1_clean.csv")
    test_set_1["correct_words"] = test_set_1["incorrect_words"].fillna("")
    test_set_1["incorrect_words"] = test_set_1["incorrect_words"].fillna("")

In [None]:
test_set_1.columns

In [None]:
test_set_1[["text", "num_tokens", "pos", "tag", "dep", "morph", "lemmas", "left_edge", "right_edge"]].head(1).T

## 6 Define rule based logic

### 6.1 Implement NLP specific functions

This section contains functions that extract grammatical details like gender, number, and sentence role from tagged words.

### 6.2 Implement generic testing function

Before we start implementing any rules, we should first create a testing function for them. Since we don’t have the specific rule functions ready, we’ll set up a general testing function - actually, we’ll create three functions - that can take in a list of conditions and a list of rules. It’s important that both lists have the same length. The testing function will process text, apply these conditions and rules to detect errors, and then compare the results with expected outcomes, logging whether each test passes or fails.

In [None]:
def all_rules(row_dict, condition_fns, rule_fns):
    # ⚠️Make sure the sentence always ends with a punctuation. This is a workaround!
    # UDPipe not always separates into tokens correctly, sometimes reports the word+punctuation (or special symbol) as single token.
    # Stanza has bad performance and this function is called iteratively multiple times, not a good idea to use Stanza here.
    sentence = row_dict["text"] + "."
    doc = nlp_ud(sentence)
    errors = []

    # ⚠️This is a very ugly workaround. UDPipe does not correctly split into tokens when there is punctuation
    # in the middle of the sentence. We return "no errors" here which might make some tests fail but
    # the goal is not to raise an exception. The correct solution is not to use UDPipe at all and pre-calculate the tokens
    # in the beginning.
    if (len(doc) != row_dict['num_tokens']):
        return errors

    # Iterate over token pairs with indices
    # for i in range(len(doc) - 1):
    for i in range(row_dict['num_tokens'] - 1):
        token1 = doc[i].text
        token2 = doc[i + 1].text

        conditions = [condition_fn(row_dict, i) for condition_fn in condition_fns]

        for condition, rule_fn in zip(conditions, rule_fns):
            if condition:
                errors += rule_fn(token1, token2, i)

    return errors

In [None]:
def test_definite_article(row_dict, condition_fns, rule_fns):
    actual_wrong_words = []
    expected_wrong_words = [word.strip() for word in row_dict["incorrect_words"].split(",") if word.strip() != ""]
    actual_wrong_words = all_rules(row_dict, condition_fns, rule_fns)

    test_res = "Pass" if set(actual_wrong_words) == set(expected_wrong_words) else "Fail"

    return (test_res, actual_wrong_words)

In [None]:
# tests all sentences from a dataset that already contains NLP tags
def test_definite_article_all(nlp_df, conditions_fns, rules_fns, print_passed=True, print_failed=True, print_total=True):
    """Tests definite articles in sentences and logs results."""
    idx_failed, idx_passed = [], []

    for i, row in nlp_df.iterrows():
        row_dict = row.to_dict()
        res = test_definite_article(row_dict, conditions_fns, rules_fns)
        status, incorrect = res[0], res[1]

        if status == "Fail":
            idx_failed.append(i)
            if print_failed:
                print(i, f'❌ {row_dict["text"]} (Actual: \'{",".join(incorrect)}\', Expected: \'{row_dict["incorrect_words"]}\')')
        else:
            idx_passed.append(i)
            message = f'✅ {row_dict["text"]} (\'{",".join(incorrect)}\' is incorrect)' if incorrect else f'✅ {row_dict["text"]} (The sentence is correct)'
            if print_passed:
                print(i, message)

    if print_total:
        n_failed = len(idx_failed)
        print("✅ All tests passed" if n_failed == 0 else f"❌ {n_failed}/{len(nlp_df)} failed.")

    return idx_passed, idx_failed

### 6.3 Iteration 1

#### 6.3.1 Implement Rule1

According to Rule1, long definite article should be used when the noun takes the role of the subject. Additionally, in order to exclude groups of adjective + noun, which fall under Rule3, we need to exclude noun that follow an adjective.

In [None]:
def cond_rule1(row_dict, i):
    """Searches for a NOUN, masculine, singular, which is the SUBJECT (nominal or clausal) in the sentence,
       and is not preceded by an ADJective"""
    is_token1_adj = nlp_get_pos(row_dict, i) == "ADJ"
    is_token1_noun = nlp_get_pos(row_dict, i) == "NOUN"
    is_token1_masc_sg = is_masculine_singular(row_dict, i)
    is_token1_subj = is_dep_subject(row_dict, i)

    is_token2_noun = nlp_get_pos(row_dict, i + 1) == "NOUN"
    is_token2_masc_sg = is_masculine_singular(row_dict, i + 1)
    is_token2_subj = is_dep_subject(row_dict, i + 1)

    # if token is masculine singular noun and subj, and the previous token is not an adjective
    if i == 0 and is_token1_masc_sg and is_token1_noun and is_token1_subj:
        return True
    if i != 0 and not is_token1_adj and is_token2_masc_sg and is_token2_noun and is_token2_subj:
        return True

    return False

In [None]:
def rule1(token1, token2, token_idx):
    errors = []
    word = token1 if token_idx == 0 else token2
    if not word.endswith(("ът", "ят")):
        errors.append(word)
    return errors

#### 6.3.2 Test Rule1

In [None]:
idx_passed, idx_current = test_definite_article_all(test_set_1, [cond_rule1], [rule1])

##### Analyze some of the failures

Now let's taкe the first several failed tests and figure out why they failed.

In [None]:
def row_to_dict(df, index):
    row_dict = df.loc[index].to_dict()
    row_dict["index"] = index
    return row_dict

In [None]:
res = []
res.append(inspect_word(row_to_dict(test_set_1, 10), 0))  # 👉Готвача приготви обяда.
res.append(inspect_word(row_to_dict(test_set_1, 16), 2))  # Таня търси 👉лекарят.
res.append(inspect_word(row_to_dict(test_set_1, 18), 2))  # Говорим за 👉лекарят.
res.append(inspect_word(row_to_dict(test_set_1, 20), 5))  # Таня търси Ангел Иванчев, 👉лекарят.
res.append(inspect_word(row_to_dict(test_set_1, 22), 5))  # Говорим за Ангел Иванчев, 👉лекарят.
res.append(inspect_word(row_to_dict(test_set_1, 24), 3))  # Той ми даде 👉ключът от къщата.
print("\n".join(res))

The problem with _**готвача**_ is that it is considered to be feminine by Stanza. For now we **will postpone fixing** because without context it is impossible to determine the gender, and without gender we can't apply Rule1.

The word _**лекарят**_ in _Таня търси лекарят_ is determined as an nsubj (nominal subject) by Stanza but the correct dependency is object. Well, grammatically it would be correct if we assume that the doctor is the doer in the sentence. However, that is the less popular word order. Let's **postpone fixing** this sentence for later.

The word _**Лекарят**_ in the sentences \
_Говорим за лекарят_, \
_Таня търси Ангел Иванчев, лекарят_ \
and _Говорим за Ангел Иванчев, лекарят._ \
is determined to be iobj (indirect object), nmod (nominal modifier) and conj (conjust) so that falls under Rule2.

_**ключът**_ is marked as object so it also falls under Rule2.

So the last 4 failures are expected to be fixed after we implement Rule2.

Separate the failed and analyzed tests into one list and the passed tests into another list. The one with the passed tests will be used for regression testing.

In [None]:
idx_passed = sorted(list(set(idx_passed) - set([10, 16])))
idx_current = sorted(list(set(idx_current) - set([10, 16])))

### 6.4 Iteration 2

#### 6.4.1 Implement Rule2

Here we will implement the first part of Rule2, which states that when the noun takes the role of the object, then the short form of the definite article should be used.

In [None]:
def cond_rule2(row_dict, i):
    """Searches for masculine singular nouns that are the object in the sentence"""
    is_token1_noun = nlp_get_pos(row_dict, i) == "NOUN"
    is_token1_masc_sg = is_masculine_singular(row_dict, i)
    is_token1_subj = is_dep_subject(row_dict, i)

    if is_token1_noun and is_token1_masc_sg and not is_token1_subj:
        return True
    return False

In [None]:
def rule2(token1, token2, idx_token):
    errors = []
    if token1.endswith(("ът", "ят")):  # objects should not have full article
        errors.append(token1)
    return errors

#### 6.4.2 Test Rule2

In [None]:
p, c = test_definite_article_all(test_set_1.loc[idx_current], [cond_rule1, cond_rule2], [rule1, rule2])

In the previous run rows 18, 20, 22 and 24 failed, now after implementing Rule2 they pass. Additionally, 4 more tests pass.

##### Analyze some of the failures

In [None]:
res = []
res.append(inspect_word(row_to_dict(test_set_1, 26), 0))  # 👉Принца говори с царят.
res.append(inspect_word(row_to_dict(test_set_1, 34), 0))  # 👉Дългоочаквания гост дойде.
res.append(inspect_word(row_to_dict(test_set_1, 34), 1))  # Дългоочаквания 👉гост дойде.
res.append(inspect_word(row_to_dict(test_set_1, 36), 2))  # Иван, 👉госта от Сопот, дойде.
res.append(inspect_word(row_to_dict(test_set_1, 38), 2))  # Иван е 👉съученика ми.
res.append(inspect_word(row_to_dict(test_set_1, 40), 2))  # Иван е 👉добрия.
res.append(inspect_word(row_to_dict(test_set_1, 42), 2))  # Иван е 👉успелия.
print("\n".join(res))

* _Принца_ - considered feminine, which is wrong, let's postpone fixing for now.
* _Дългоочаквания_  - will be fixed by Rule3
* _гост_            - also will be fixed by Rule3
* _госта_           - add to later (not sure yet which rule to apply)
* _съученика_       - add to later (because root)
* _добрия_          - add to later (because root)
* _успелия_         - Stanza thinks this is a VERB, which is wrong, so let's add to the list of "will not fix"

In [None]:
# keep the lists of passed and current tests up to date
idx_failed = [26, 36, 38, 40, 42]
idx_passed = sorted(set(idx_passed + p))
idx_passed = sorted(list(set(idx_passed) - set(idx_failed)))
idx_current = sorted(list(set(idx_current) - set(p)))
idx_current = sorted(list(set(idx_current) - set(idx_failed)))

#### 6.4.3 Regression Test

Test the previously successful sentences to determine if Rule2 caused any breakages. Perform the regression using all rules defined till now (Rule1, Rule2)

In [None]:
_ = test_definite_article_all(test_set_1.loc[idx_passed], [cond_rule1, cond_rule2], [rule1, rule2], print_passed=False)

##### Analyze results from the regression

In [None]:
res = []
res.append(inspect_word(row_to_dict(test_set_1, 37), 2))  # Иван, 👉гостът от Сопот, дойде.
res.append(inspect_word(row_to_dict(test_set_1, 39), 2))  # Иван е 👉съученикът ми.
res.append(inspect_word(row_to_dict(test_set_1, 59), 3))  # Приятелят ни е 👉лекарят.
res.append(inspect_word(row_to_dict(test_set_1, 61), 3))  # Ангел Иванчев, 👉лекарят, живее тука.
res.append(inspect_word(row_to_dict(test_set_1, 63), 6))  # Приятелят ни е Ангел Иванчев, 👉лекарят.
print("\n".join(res))

When we implement the next rules these failures should be fixed. For now let's move them to the list with failed tests. We'll get back to them later.

In [None]:
idx_failed = [37, 39, 59, 61, 63]
idx_passed = sorted(list(set(idx_passed) - set(idx_failed)))

### 6.5 Iteration 3

#### 6.5.1 Implement Rule3

Rule3 states that adjectives, numerals, participles and possessive pronouns take the same article as the noun they agree with. Let's first implement the rules for adjectives.

In [None]:
def rule3(token1, token2, idx_token):
    errors = []

    # Дългоочаквания гост
    if token1.endswith(("я")):  # прилагателното трябва да е с пълен член
        errors.append(token1)

    # Дългоочакваният гост
    if token2.endswith(("ът", "ят")):  # съществителното трябва да е с непълен член
        errors.append(token2)

    return errors

#### 6.5.2 Test Rule3

In [None]:
conditions = [cond_rule1, cond_rule2, cond_rule3]
rules = [rule1, rule2, rule3]
p, c = test_definite_article_all(test_set_1.loc[idx_current], conditions, rules)

##### Analyze the test

In [None]:
res = []
res.append(inspect_word(row_to_dict(test_set_1, 44), 3))  # Пешо се оказа 👉най-верния му приятел.
res.append(inspect_word(row_to_dict(test_set_1, 46), 3))  # Пешо се оказа 👉дарителя на училището.
res.append(inspect_word(row_to_dict(test_set_1, 48), 5))  # Ученикът на първия чин изглежда 👉най-доволния от всички.
res.append(inspect_word(row_to_dict(test_set_1, 50), 0))  # 👉Малкия, заповядай едно бонбонче!
res.append(inspect_word(row_to_dict(test_set_1, 58), 0))  # 👉Приятеля ни е 👉лекаря.
res.append(inspect_word(row_to_dict(test_set_1, 60), 3))  # Ангел Иванчев, 👉лекаря, живее тука.
res.append(inspect_word(row_to_dict(test_set_1, 62), 0))  # 👉Приятеля ни е Ангел Иванчев, 👉лекаря.
print("\n".join(res))

* 44 _най-верния_ - should be fixed by Rule4
* 46 _дарителя_ - should be fixed by Rule4
* 48 _най_доволния_ - should be fixed by Rule4
* 50 _Малкия_ - should be fixed by Rule_6
* 58 _Приятеля_ on row 58 - should be fixed by Rule_7
* 60 _лекаря_ - should be fixed by Rule_7
* 62 _Приятеля_ on row 62 - should be fixed by Rule_7

In [None]:
# keep the lists with passing and current tests up to date
idx_failed = [50, 58, 60, 62]
idx_passed = sorted(set(idx_passed + p))
idx_passed = sorted(list(set(idx_passed) - set(idx_failed)))
idx_current = sorted(list(set(idx_current) - set(p)))
idx_current = sorted(list(set(idx_current) - set(idx_failed)))

#### 6.5.3 Regression Test

Ensure that we have not violated Rule 1 and Rule 2:

In [None]:
_ = test_definite_article_all(test_set_1.loc[idx_passed], conditions, rules, print_passed=False)

### 6.6 Iteration 4

#### 6.6.1 Implement Rule4

According to Rule4, the full definite article should be used when a noun is after verbs like _съм_, _бъда_, _оказвам се_, _изглеждам_, etc. In order to check the previous verb, we need to check the _lemma_ of the verb.

First let's see what the previously failed sentences look like.

In [None]:
inspect_spacy_doc(nlp_st(test_set_1.loc[39, "text"]))
inspect_spacy_doc(nlp_st(test_set_1.loc[44, "text"]))
inspect_spacy_doc(nlp_st(test_set_1.loc[46, "text"]))
inspect_spacy_doc(nlp_st(test_set_1.loc[48, "text"]))

<img src="images/rule4_pattern_new.png" width="800">

The above pattern shows that the first token is a verb (including auxiliary), while the second token is either a noun or an adjective, both of which must be in masculine singular form.

#### 6.6.2 Test Rule4

In [None]:
conditions = [cond_rule1, cond_rule2, cond_rule3, cond_rule4]
rules = [rule1, rule2, rule3, rule4]

p, c = test_definite_article_all(test_set_1.loc[[39, 44, 46, 48]], conditions, rules)

##### Analyze

We expected row 39 to pass but it is still failing. This is likely due to one of the existing rules. We should run tests on each rule individually to identify which one is causing the issue, specifically the one that returns the word _съученикът_.

In [None]:
p, c = test_definite_article_all(test_set_1.loc[[39]], [cond_rule1], [rule1], print_passed=False)

In [None]:
p, c = test_definite_article_all(test_set_1.loc[[39]], [cond_rule2], [rule2], print_passed=False)

Row 39 doesn't pass due to Rule2. In Rule4 we checked whether the previous token was AUX or a VERB, so in Rule2 we need to introduce an exception that specifies the previous token cannot be a VERB or AUX. Otherwise we will have two rules acting on the same pattern.

#### 6.6.3 Update Rule2 and test again

In [None]:
# keep the old rule if we need to test with it
cond_rule2_prev = cond_rule2
rule2_prev = rule2


def cond_rule2(row_dict, i):
    """Search for either:
        - single, masculine NOUN, not subject, not preceded by a verb
        - preceding token is a verb but different from съм, оказвам се, излгежда, etc."""
    is_token1_noun = nlp_get_pos(row_dict, i) == "NOUN"
    is_token1_verb = nlp_get_pos(row_dict, i) == "VERB"
    is_token1_aux = nlp_get_pos(row_dict, i) == "AUX"
    is_token1_masc_sg = is_masculine_singular(row_dict, i)
    is_token1_subj = is_dep_subject(row_dict, i)

    is_token2_noun = nlp_get_pos(row_dict, i + 1) == "NOUN"
    is_token2_subj = is_dep_subject(row_dict, i + 1)
    is_token2_masc_sg = is_masculine_singular(row_dict, i + 1)

    # if token is masculine single noun and !subj, and the previous token is not a verb
    if i == 0 and is_token1_noun and is_token1_masc_sg and not is_token1_subj:
        return True

    is_like_verb = is_token1_verb or is_token1_aux

    if i != 0 and is_token2_noun and is_token2_masc_sg and not is_token2_subj:
        lemma = nlp_get_lemma(row_dict, i)

        is_lemma_aux_syn = lemma in ("е", "съм", "окажа-(се)", "изглежда")
        is_non_aux_syn = not is_like_verb or (is_like_verb and not is_lemma_aux_syn)
        if is_non_aux_syn:
            return True

    return False

In [None]:
def rule2(token1, token2, token_idx):
    errors = []
    word = token1 if token_idx == 0 else token2
    if word.endswith(("ът", "ят")):  # objects should not have full article
        errors.append(word)

    return errors

First test with a small set:

In [None]:
conditions = [cond_rule1, cond_rule2, cond_rule3, cond_rule4]
rules = [rule1, rule2, rule3, rule4]
p, c = test_definite_article_all(test_set_1.loc[[39, 44, 46, 48]], conditions, rules)

Next test with the remaining rows:

In [None]:
p, c = test_definite_article_all(test_set_1.loc[idx_current], conditions, rules)

##### Analyze the remaining failures

In [None]:
inspect_spacy_doc(nlp_st(test_set_1.loc[66, "text"]))
inspect_spacy_doc(nlp_st(test_set_1.loc[68, "text"]))

* 66 _новият бял блок_ and 68 _Високия бял блок_ - the pattern ADJ/ADJ/NOUN points to Rule5.
* 72 and 73 depend on the order of words. We don't know who is the doer of the action in those sentences. Let's postpone fixing them for now.

In [None]:
idx_failed = [72, 73]
idx_passed = sorted(set(idx_passed + p))
idx_passed = sorted(list(set(idx_passed) - set(idx_failed)))
idx_current = sorted(list(set(idx_current) - set(p)))
idx_current = sorted(list(set(idx_current) - set(idx_failed)))

#### 6.6.4 Regression Test

In [None]:
_ = test_definite_article_all(test_set_1.loc[idx_passed], conditions, rules, print_passed=False)

### 6.7 Iteration 5

#### 6.7.1 Implement Rule5

Rule5 says that when there are two or more adjectives in front of the noun, only the first adjective takes the definite article, which could be short or long. In the example with rows 66 and 68 which were analyzed in the previous section, we see that the two phrases _новият бял блок_ and _високия бял блок_ which are identical from grammatical standpoint, cannot be distinguished just by looking at their part of speech. We need to also analyze their relation to the entire sentence. The difference here is that in the first case Stanza correctly determined the noun _блок_ as an indirect object, therefore requiring the short form of the definite article, and the same word in the second sentence as nominal subject, requiring full definite article.

<img src="images/rule7_pattern_new.png" width="650">

Although we expressed the condition verbally as one rule, we need to create two sets of functions for the cases of short and long definite article.

In [None]:
# The adjectives, as well as the pronouns, the ordinal numerals etc., used as attributes in the sentence
# are usually placed in front of the nouns they qualify. In this case, the definite article, if needed,
# is joined to the _first_ attribute of the noun phrase. [4],[5]
def cond_rule5_helper(row_dict, i):
    """
    POS: ADJ / ADJ / NOUN iobj,  all masc. singular -> first ADJ must have def. art. short form
    - or -
    POS: ADJ / ADJ / NOUN nsubj, all masc. singular -> first ADJ must have def. art. long form
    """
    if i >= row_dict["num_tokens"] - 2:
        return False

    is_token1_masc_sg = is_masculine_singular(row_dict, i)
    is_token1_adj = nlp_get_pos(row_dict, i) == "ADJ"

    is_token2_masc_sg = is_masculine_singular(row_dict, i + 1)
    is_token2_adj = nlp_get_pos(row_dict, i + 1) == "ADJ"

    is_token3_masc_sg = is_masculine_singular(row_dict, i + 2)
    is_token3_noun = nlp_get_pos(row_dict, i + 2) == "NOUN"

    if is_token1_masc_sg and is_token2_masc_sg and is_token3_masc_sg:
        if is_token1_adj and is_token2_adj and is_token3_noun:
            return True

    return False

In [None]:
def cond_rule5_short(row_dict, i):
    if cond_rule5_helper(row_dict, i):
        noun_dep = nlp_get_dep(row_dict, i + 2)
        if noun_dep in ("iobj"):
            return True

    return False

In [None]:
def cond_rule5_long(row_dict, i):
    if cond_rule5_helper(row_dict, i):
        noun_dep = nlp_get_dep(row_dict, i + 2)
        if noun_dep in ("nsubj"):
            return True

    return False

In [None]:
def rule5_short(token1, token2, token_idx):
    errors = []

    # Аз живея в новият бял блок.
    if not token1.endswith(("я")):  # първото прилагателно в групата трябва да е с непълен член
        errors.append(token1)

    return errors

In [None]:
def rule5_long(token1, token2, token_idx):
    errors = []

    # Високия бял блок е нов.
    if not token1.endswith(("ят")):  # първото прилагателно в групата трябва да е с пълен член
        errors.append(token1)

    return errors

#### 6.7.2 Test Rule5

In [None]:
conditions = [cond_rule1, cond_rule2, cond_rule3, cond_rule4, cond_rule5_short, cond_rule5_long]
rules = [rule1, rule2, rule3, rule4, rule5_short, rule5_long]
p, c = test_definite_article_all(test_set_1.loc[idx_current], conditions, rules)

#### 6.7.3 Regression test

In [None]:
_ = test_definite_article_all(test_set_1.loc[idx_passed], conditions, rules, print_passed=False)

#### 6.7.4 Final test with the whole set

In [None]:
_ = test_definite_article_all(test_set_1, conditions, rules)

We didn't predict correctly the usage of the definite article in 13 out of the 78 sentences. In some cases it is due to the NLP tag being incorrect, for example the case where the gender of the noun was tagged wrongly by Stanza. In other case it was because we still haven't implemented Rule7.

## 7 Test with real data

We will work with the dataset [bulgarian-grammar-mistakes](https://huggingface.co/datasets/thebogko/bulgarian-grammar-mistakes) from huggingface. The data was originally collected from articles from Bulgarian Wikipedia as well as rows from OSCAR's Bulgarian datasets.

### 7.1 Load and prepare the dataset

In [None]:
grammar_errors = pd.read_csv("https://raw.githubusercontent.com/MirkaIvanova/datasets/refs/heads/main/data_science_project/grammar_errors_original.csv")

In [None]:
grammar_errors.shape

In [None]:
grammar_errors.head(5)

#### 7.1.1 Filter only errors related to article misuse:

In [None]:
grammar_errors.error_type.unique()

In [None]:
grammar_errors = grammar_errors[grammar_errors["error_type"] == "article_misuse"]

In [None]:
grammar_errors.shape

#### 7.1.2 Rename column headings

In [None]:
grammar_errors = grammar_errors.rename(columns={"erroneous": "incorrect"})

#### 7.1.3 Add a column with differing words

In [None]:
grammar_errors[['incorrect', 'correct']] = grammar_errors[['incorrect', 'correct']].map(str.strip)
grammar_errors[["correct_words", "incorrect_words"]] = grammar_errors.apply(get_differing_words, axis=1, result_type="expand")

In [None]:
grammar_errors.head(3)

#### 7.1.4 Melt the dataset and sort pairs of correct/incorrect sentences to be together

In [None]:
grammar_errors.shape

In [None]:
grammar_errors = melt_and_sort_sentences(grammar_errors)

In [None]:
grammar_errors.shape

In [None]:
grammar_errors.head(4)

#### 7.1.5 Add NLP tags

<div style="background-color:bisque">⚠️Note that calculating the features using Stanza takes around 1 hour on a laptop with average specs, therefore here we are loading a pre-saved file.

In [None]:
from copy import deepcopy

In [None]:
grammar_errors.columns

In [None]:
grammar_errors_10 = deepcopy(grammar_errors.iloc[0:100])
grammar_errors.shape, grammar_errors_10.shape

In [None]:
start_time = time.time()

feature_columns_st = ["pos", "tag", "dep", "morph", "lemmas", "left_edge", "right_edge", "num_tokens"]
grammar_errors_10[feature_columns_st] = grammar_errors_10.apply(lambda r: extract_features(nlp_st, r, "text"), axis=1, result_type="expand")
grammar_errors_10.to_csv("data/grammar_errors_10.csv", index=None)

end_time = time.time()
execution_time_st = end_time - start_time
print(f"Execution time (stanza): {execution_time_st} seconds")

In [None]:
grammar_errors_10

#### 7.1.6 Filter only rows with 1 sentence, no quotes and other special characters

We need to filter out such texts since our rules can't deal with quoted text inside a sentence. Special characters also cause issues.

In [None]:
grammar_errors = grammar_errors[grammar_errors["n_sents"] == 1]

In [None]:
pattern = r"[\'\"‘’“”°/≈\:]"
grammar_errors = grammar_errors[~grammar_errors['text'].str.contains(pattern)]

In [None]:
grammar_errors.shape

### 7.2 Test with small subset of the real data

In [None]:
grammar_errors_10tokens = grammar_errors[grammar_errors["num_tokens"] < 10]

In [None]:
# execution will take 30 sec
passed, failed = test_definite_article_all(grammar_errors_10tokens, conditions, rules, print_passed=False, print_failed=True)

Let's review the outcomes.

**To begin with**, the rules failed to detect errors in nearly a quarter (24%) of the tests.
However, upon closer inspection of the failures, a pattern emerges. Upon closer look, it is evident that most of the failed tests have even-numbered indices. This suggests that while correctly formed sentences were accurately identified, not all erroneous sentences were successfully detected.

Let's take a closer look at the instances where the tests failed for sentences that were initially correct:

In [None]:
grammar_errors.loc[[338, 339, 2022, 2023, 3396, 3397, 3942, 3943 ]][['text', 'is_correct', 'incorrect_words', 'correct_words']]

The sentence _Да, това е Учителят_ is listed as both correct and incorrect.

The sentence _Администратора на форума така е решил_ is listed as correct, but in fact it is not.

The sentence _Това е успокоителния хап за съвестта му_ is also listed as correct, but in fact it is incorrect.

Therefore, in at least these 3 examples, the rules correctly identified the error, but the expected result was wrong bcause the original data in the dataset was wrong.

The assumption about the original wrong data is, that the data was scraped from various sources and considered "correct", then errors were automatically introduced to produce the incorrect version. However, it's important to note that the source material itself may not always be grammatically accurate. For exmaple, the sentence _Това е успокоителния хап за съвестта му_ can be found [in a blog comment from 2010](https://petdoshkov.blog.bg/drugi/2010/05/10/vyzzivnoto-reshenie-za-nakazanieto-zabelejka.542411). This example demonstrates that even the original text contained grammatical errors, challenging the assumption that the initial data was entirely correct before deliberate mistakes were added.




**Another reason** of the many failures is that we still haven't implemented all rules listed in the Grammas section. Additionally, we tested on a small set of 39 pairs of simple sentences so we didn't verify our "model" with enough data. Another explanation is that in the NLP features we extracted there are two still unused ones - left edge and right edge.

### 7.3 Test with the whole dataset

In [None]:
grammar_errors.shape

<div style="background-color:Bisque">

⚠️The below test will take ~ 18 minutes. As an alternative to executing it you may take a look at the screenshot.

In [None]:
if False:
    start_time = time.time()

    passed, failed = test_definite_article_all(grammar_errors, conditions, rules, print_passed=False, print_failed=False)

    end_time = time.time()
    execution_time_st = end_time - start_time
    print(f"Execution time (stanza): {execution_time_st} seconds")

Screenshot of the result:

<img src="images/whole_dataset_result.png">

Testing with the whole dataset we see that one third of the tests failed. Certainly there is room for improvement!

## Summary and insights gained

Although the results of the final test were not as encouraging as hoped, the project still yielded valuable insights.

Initially, selecting the appropriate NLP library was a challenging task. While UDPipe offered good performance, it lacked accuracy, and although Stanza delivered better results, it was much slower. This experience highlighted the importance of carefully balancing accuracy and performance when choosing tools for linguistic analysis.

Another key realization was the unexpected complexity of Bulgarian grammar, particularly the rules surrounding the use of the definite article. This complexity necessitated a deeper investigation into the language's linguistic structures.

Parsing text to detect incorrect definite articles also proved to be more complicated than expected. The development of effective rules for this task was hindered by the diversity of sentence structures encountered.

Additionally, the testing dataset posed its own challenges, as it contained inaccuracies that affected the validation process. This emphasized the crucial need for high-quality, accurate datasets.

Despite these obstacles, the implementation of rule-based methods produced promising results, especially in identifying errors in shorter sentences. This success demonstrated that even in the face of linguistic complexity, well-designed rules are essential for achieving accurate outcomes.

The insights gained from this effort will undoubtedly contribute to the broader field of computational linguistics and inspire more accurate and efficient solutions for grammatical analysis in Bulgarian. As we continue to refine our approach and expand our rule set, we are confident in our ability to develop a robust and reliable tool for checking the correctness of definite articles in Bulgarian texts.

**References**

<div id="ref1">[1] Astoria Academy, <a href="https://astoria-academy.com/the-definite-articles-of-bulgarian/">The Definite articles of Bulgarian,</a> 2023</div>

<div id="ref2">[2] CoLanguage <a href="https://www.colanguage.com/definite-article-bulgarian-nouns">Definite article of the Bulgarian nouns</a></div>

<div id="ref3">[3] в. „Аз Буки“ бр. 16 <a href="https://ibl.bas.bg/ezikovi_spravki/otnovo-za-palniya-i-kratkiya-tchlen/">Отново за пълния и краткия член</a></div>

<div id="ref4">[4] Raquel Jacob <a href="https://help.unbabel.com/hc/en-us/articles/360022878854-Language-Guidelines-Bulgarian">Language Guidelines – Bulgarian</a></div>

<div id="ref5">[5] Andonova, Sabeva, Zagorova <a href="https://caritas.bg/cms/wp-content/uploads/2015/04/A1-English.pdf?x10535">Bulgarian for Refugees,</a> 2014</div>

<div id="ref6">[6] John Leafgren <a href="http://www.seelrc.org:8080/grammar/pdf/stand_alone_bulgarian.pdf">A Concise Bulgarian
Grammar</a></div>

<div id="ref7">[7] G. Popova <a href="https://www.english-linguistics.de/archives/clark/SIMOV/CM/popova.pdf">Towards an HPSG Account of the
Bulgarian Definite Article</a></div>

<div id="ref8">[8] K. Bontcheva <a href="https://theswissbay.ch/pdf/Books/Linguistics/Mega%20linguistics%20pack/Indo-European/Balto-Slavic/Bulgarian%20Grammar%2C%20Elementary%20%28Bontcheva%29.pdf">Bulgarian Language - Grammar</a></div>