In [2]:
# !pip install ufal.udpipe
# !pip install spacy-stanza
!pip install --upgrade spacy-stanza
!pip install stanza

Collecting spacy-stanza
  Using cached spacy_stanza-1.0.4-py3-none-any.whl.metadata (8.6 kB)
Collecting stanza<1.7.0,>=1.2.0 (from spacy-stanza)
  Using cached stanza-1.6.1-py3-none-any.whl.metadata (13 kB)
Collecting emoji (from stanza<1.7.0,>=1.2.0->spacy-stanza)
  Downloading emoji-2.14.1-py3-none-any.whl.metadata (5.7 kB)
Using cached spacy_stanza-1.0.4-py3-none-any.whl (9.7 kB)
Using cached stanza-1.6.1-py3-none-any.whl (881 kB)
Downloading emoji-2.14.1-py3-none-any.whl (590 kB)
   ---------------------------------------- 0.0/590.6 kB ? eta -:--:--
   --------------------------------------- 590.6/590.6 kB 10.6 MB/s eta 0:00:00
Installing collected packages: emoji, stanza, spacy-stanza
Successfully installed emoji-2.14.1 spacy-stanza-1.0.4 stanza-1.6.1


In [3]:
from spacy.tokens import Doc
# from ufal.udpipe import Model, Pipeline

import os
import pandas as pd
import re
import spacy
import spacy_stanza
import stanza
import time
import urllib.request
import warnings

In [4]:
# Suppress specific warnings
warnings.filterwarnings("ignore", category=FutureWarning, message=".*torch.load.*")

## 3 Experiments with Stanza and UDPipe

### 3.1 Download and initialize Bulgarian models

In [5]:
# Download Stanza
if not os.path.isdir("./stanza_resources/bg"):
    stanza.download("bg", model_dir="./stanza_resources")

Downloading https://raw.githubusercontent.com/stanfordnlp/stanza-resources/main/resources_1.6.0.json:   0%|   ‚Ä¶

2025-01-22 15:35:57 INFO: Downloading default packages for language: bg (Bulgarian) ...


Downloading https://huggingface.co/stanfordnlp/stanza-bg/resolve/v1.6.0/models/default.zip:   0%|          | 0‚Ä¶

2025-01-22 15:36:40 INFO: Finished downloading models and saved to ./stanza_resources.


In [6]:
processors = "tokenize,pos,lemma"  # A string with comma-separated processor names

# Initialize the pipeline
nlp_spacy_stanza = spacy_stanza.load_pipeline(
    "bg",
    dir="./stanza_resources",
    processors=processors
)

# nlp_spacy_stanza = spacy_stanza.load_pipeline("bg", dir="./stanza_resources")

2025-01-22 15:36:44 INFO: Checking for updates to resources.json in case models have been updated.  Note: this behavior can be turned off with download_method=None or download_method=DownloadMethod.REUSE_RESOURCES


Downloading https://raw.githubusercontent.com/stanfordnlp/stanza-resources/main/resources_1.6.0.json:   0%|   ‚Ä¶

2025-01-22 15:36:45 INFO: Loading these models for language: bg (Bulgarian):
| Processor | Package      |
----------------------------
| tokenize  | btb          |
| pos       | btb_charlm   |
| lemma     | btb_nocharlm |

2025-01-22 15:36:45 INFO: Using device: cpu
2025-01-22 15:36:45 INFO: Loading: tokenize
2025-01-22 15:36:47 INFO: Loading: pos
2025-01-22 15:36:48 INFO: Loading: lemma
2025-01-22 15:36:48 INFO: Done loading processors!


### 3.2 Define initial functions

In [7]:
def nlp_st(txt):
    return nlp_spacy_stanza(txt)

### 3.3 Compare NLP features

In [8]:
# A helper function for displaying useful NLP features in easy to read format
def inspect_spacy_doc(doc):
    for token in doc:
        print(f"Token: {token.text:<15} Tag: {token.tag_:<15} POS: {token.pos_:<10} Lemma: {token.lemma_:<15} Dep: {token.dep_:<10}")
    print("\n")

In [9]:
sentence = """–°—Ä–µ–¥ –≥–æ—Å—Ç–∏—Ç–µ –Ω–∞ –æ—Ñ–∏—Ü–∏–∞–ª–Ω–∞—Ç–∞ —Ü–µ—Ä–µ–º–æ–Ω–∏—è –ø–æ –≤—Å—Ç—ä–ø–≤–∞–Ω–µ –≤ –¥–ª—ä–∂–Ω–æ—Å—Ç –Ω–∞ –Ω–æ–≤–æ–∏–∑–±—Ä–∞–Ω–∏—è –ø—Ä–µ–∑–∏–¥–µ–Ω—Ç —â–µ –±—ä–¥–∞—Ç –±—ä–ª–≥–∞—Ä—Å–∫–∏—è—Ç \
–¥—ä—Ä–∂–∞–≤–µ–Ω –≥–ª–∞–≤–∞ –†—É–º–µ–Ω –†–∞–¥–µ–≤, –ø—Ä–µ–∑–∏–¥–µ–Ω—Ç—ä—Ç –Ω–∞ –ê–ª–±–∞–Ω–∏—è –ò–ª–∏—Ä –ú–µ—Ç–∞ –∏ –Ω–∞ –ö–æ—Å–æ–≤–æ –•–∞—à–∏–º –¢–∞—á–∏."""

### 3.5 Compare performance with large amount of data

In [12]:
# Create a DataFrame with 10 rows, use same sentence for simplicity
test_df = pd.DataFrame({"text": [sentence] * 10})

In [13]:
def sent_features_to_string(sent):
    """Converts sentence features to strings"""
    words = [tk.pos_ for tk in sent]
    lemmas = [(tk.lemma_) for tk in sent]
    pos = [tk.pos_ for tk in sent]
    tag = [tk.tag_ for tk in sent]    # morph?
    morph = [str(tk.morph) for tk in sent] # features?
    dep = [tk.dep_ for tk in sent]
    n_words = len(sent)

    return words, lemmas, pos, tag, morph, dep, n_words



def extract_features(nlp, row, col):
    """Extracts linguistic features from a text column using a given NLP model"""
    value = row[col]
    doc = nlp(value)
    return sent_features_to_string(doc)

#### 3.5.1 Extracting features using Stanza

In [14]:
start_time = time.time()

test_df[["words_s", "lemmas_s", "pos_s", "morph_s", "features_s", "dep_s", "n_words_s"]] = test_df.apply(
    lambda r: extract_features(nlp_st, r, "text"), axis=1, result_type="expand"
)

end_time = time.time()
execution_time_st = end_time - start_time

In [15]:
print(f"Execution time (stanza): {execution_time_st} seconds")

Execution time (stanza): 6.670640468597412 seconds


In [42]:
test_df

Unnamed: 0,text,pos,tag,dep,morph,lemmas,left_edge,right_edge,num_words,words_s,lemmas_s,pos_s,morph_s,features_s,dep_s,n_words_s
0,–°—Ä–µ–¥ –≥–æ—Å—Ç–∏—Ç–µ –Ω–∞ –æ—Ñ–∏—Ü–∏–∞–ª–Ω–∞—Ç–∞ —Ü–µ—Ä–µ–º–æ–Ω–∏—è –ø–æ –≤—Å—Ç—ä–ø...,1,2,3,4,5,6,7,8,"[ADP, NOUN, ADP, ADJ, NOUN, ADP, NOUN, ADP, NO...","[—Å—Ä–µ–¥, –≥–æ—Å—Ç, –Ω–∞, –æ—Ñ–∏—Ü–∏–∞–ª–µ–Ω, —Ü–µ—Ä–µ–º–æ–Ω–∏—è, –ø–æ, –≤—Å—Ç...","[ADP, NOUN, ADP, ADJ, NOUN, ADP, NOUN, ADP, NO...","[R, Ncmpd, R, Afsd, Ncfsi, R, Ncnsi, R, Ncfsi,...","[, Definite=Def|Gender=Masc|Number=Plur, , Def...","[case, root, case, amod, nmod, case, nmod, cas...",31
1,–°—Ä–µ–¥ –≥–æ—Å—Ç–∏—Ç–µ –Ω–∞ –æ—Ñ–∏—Ü–∏–∞–ª–Ω–∞—Ç–∞ —Ü–µ—Ä–µ–º–æ–Ω–∏—è –ø–æ –≤—Å—Ç—ä–ø...,1,2,3,4,5,6,7,8,"[ADP, NOUN, ADP, ADJ, NOUN, ADP, NOUN, ADP, NO...","[—Å—Ä–µ–¥, –≥–æ—Å—Ç, –Ω–∞, –æ—Ñ–∏—Ü–∏–∞–ª–µ–Ω, —Ü–µ—Ä–µ–º–æ–Ω–∏—è, –ø–æ, –≤—Å—Ç...","[ADP, NOUN, ADP, ADJ, NOUN, ADP, NOUN, ADP, NO...","[R, Ncmpd, R, Afsd, Ncfsi, R, Ncnsi, R, Ncfsi,...","[, Definite=Def|Gender=Masc|Number=Plur, , Def...","[case, root, case, amod, nmod, case, nmod, cas...",31
2,–°—Ä–µ–¥ –≥–æ—Å—Ç–∏—Ç–µ –Ω–∞ –æ—Ñ–∏—Ü–∏–∞–ª–Ω–∞—Ç–∞ —Ü–µ—Ä–µ–º–æ–Ω–∏—è –ø–æ –≤—Å—Ç—ä–ø...,1,2,3,4,5,6,7,8,"[ADP, NOUN, ADP, ADJ, NOUN, ADP, NOUN, ADP, NO...","[—Å—Ä–µ–¥, –≥–æ—Å—Ç, –Ω–∞, –æ—Ñ–∏—Ü–∏–∞–ª–µ–Ω, —Ü–µ—Ä–µ–º–æ–Ω–∏—è, –ø–æ, –≤—Å—Ç...","[ADP, NOUN, ADP, ADJ, NOUN, ADP, NOUN, ADP, NO...","[R, Ncmpd, R, Afsd, Ncfsi, R, Ncnsi, R, Ncfsi,...","[, Definite=Def|Gender=Masc|Number=Plur, , Def...","[case, root, case, amod, nmod, case, nmod, cas...",31
3,–°—Ä–µ–¥ –≥–æ—Å—Ç–∏—Ç–µ –Ω–∞ –æ—Ñ–∏—Ü–∏–∞–ª–Ω–∞—Ç–∞ —Ü–µ—Ä–µ–º–æ–Ω–∏—è –ø–æ –≤—Å—Ç—ä–ø...,1,2,3,4,5,6,7,8,"[ADP, NOUN, ADP, ADJ, NOUN, ADP, NOUN, ADP, NO...","[—Å—Ä–µ–¥, –≥–æ—Å—Ç, –Ω–∞, –æ—Ñ–∏—Ü–∏–∞–ª–µ–Ω, —Ü–µ—Ä–µ–º–æ–Ω–∏—è, –ø–æ, –≤—Å—Ç...","[ADP, NOUN, ADP, ADJ, NOUN, ADP, NOUN, ADP, NO...","[R, Ncmpd, R, Afsd, Ncfsi, R, Ncnsi, R, Ncfsi,...","[, Definite=Def|Gender=Masc|Number=Plur, , Def...","[case, root, case, amod, nmod, case, nmod, cas...",31
4,–°—Ä–µ–¥ –≥–æ—Å—Ç–∏—Ç–µ –Ω–∞ –æ—Ñ–∏—Ü–∏–∞–ª–Ω–∞—Ç–∞ —Ü–µ—Ä–µ–º–æ–Ω–∏—è –ø–æ –≤—Å—Ç—ä–ø...,1,2,3,4,5,6,7,8,"[ADP, NOUN, ADP, ADJ, NOUN, ADP, NOUN, ADP, NO...","[—Å—Ä–µ–¥, –≥–æ—Å—Ç, –Ω–∞, –æ—Ñ–∏—Ü–∏–∞–ª–µ–Ω, —Ü–µ—Ä–µ–º–æ–Ω–∏—è, –ø–æ, –≤—Å—Ç...","[ADP, NOUN, ADP, ADJ, NOUN, ADP, NOUN, ADP, NO...","[R, Ncmpd, R, Afsd, Ncfsi, R, Ncnsi, R, Ncfsi,...","[, Definite=Def|Gender=Masc|Number=Plur, , Def...","[case, root, case, amod, nmod, case, nmod, cas...",31
5,–°—Ä–µ–¥ –≥–æ—Å—Ç–∏—Ç–µ –Ω–∞ –æ—Ñ–∏—Ü–∏–∞–ª–Ω–∞—Ç–∞ —Ü–µ—Ä–µ–º–æ–Ω–∏—è –ø–æ –≤—Å—Ç—ä–ø...,1,2,3,4,5,6,7,8,"[ADP, NOUN, ADP, ADJ, NOUN, ADP, NOUN, ADP, NO...","[—Å—Ä–µ–¥, –≥–æ—Å—Ç, –Ω–∞, –æ—Ñ–∏—Ü–∏–∞–ª–µ–Ω, —Ü–µ—Ä–µ–º–æ–Ω–∏—è, –ø–æ, –≤—Å—Ç...","[ADP, NOUN, ADP, ADJ, NOUN, ADP, NOUN, ADP, NO...","[R, Ncmpd, R, Afsd, Ncfsi, R, Ncnsi, R, Ncfsi,...","[, Definite=Def|Gender=Masc|Number=Plur, , Def...","[case, root, case, amod, nmod, case, nmod, cas...",31
6,–°—Ä–µ–¥ –≥–æ—Å—Ç–∏—Ç–µ –Ω–∞ –æ—Ñ–∏—Ü–∏–∞–ª–Ω–∞—Ç–∞ —Ü–µ—Ä–µ–º–æ–Ω–∏—è –ø–æ –≤—Å—Ç—ä–ø...,1,2,3,4,5,6,7,8,"[ADP, NOUN, ADP, ADJ, NOUN, ADP, NOUN, ADP, NO...","[—Å—Ä–µ–¥, –≥–æ—Å—Ç, –Ω–∞, –æ—Ñ–∏—Ü–∏–∞–ª–µ–Ω, —Ü–µ—Ä–µ–º–æ–Ω–∏—è, –ø–æ, –≤—Å—Ç...","[ADP, NOUN, ADP, ADJ, NOUN, ADP, NOUN, ADP, NO...","[R, Ncmpd, R, Afsd, Ncfsi, R, Ncnsi, R, Ncfsi,...","[, Definite=Def|Gender=Masc|Number=Plur, , Def...","[case, root, case, amod, nmod, case, nmod, cas...",31
7,–°—Ä–µ–¥ –≥–æ—Å—Ç–∏—Ç–µ –Ω–∞ –æ—Ñ–∏—Ü–∏–∞–ª–Ω–∞—Ç–∞ —Ü–µ—Ä–µ–º–æ–Ω–∏—è –ø–æ –≤—Å—Ç—ä–ø...,1,2,3,4,5,6,7,8,"[ADP, NOUN, ADP, ADJ, NOUN, ADP, NOUN, ADP, NO...","[—Å—Ä–µ–¥, –≥–æ—Å—Ç, –Ω–∞, –æ—Ñ–∏—Ü–∏–∞–ª–µ–Ω, —Ü–µ—Ä–µ–º–æ–Ω–∏—è, –ø–æ, –≤—Å—Ç...","[ADP, NOUN, ADP, ADJ, NOUN, ADP, NOUN, ADP, NO...","[R, Ncmpd, R, Afsd, Ncfsi, R, Ncnsi, R, Ncfsi,...","[, Definite=Def|Gender=Masc|Number=Plur, , Def...","[case, root, case, amod, nmod, case, nmod, cas...",31
8,–°—Ä–µ–¥ –≥–æ—Å—Ç–∏—Ç–µ –Ω–∞ –æ—Ñ–∏—Ü–∏–∞–ª–Ω–∞—Ç–∞ —Ü–µ—Ä–µ–º–æ–Ω–∏—è –ø–æ –≤—Å—Ç—ä–ø...,1,2,3,4,5,6,7,8,"[ADP, NOUN, ADP, ADJ, NOUN, ADP, NOUN, ADP, NO...","[—Å—Ä–µ–¥, –≥–æ—Å—Ç, –Ω–∞, –æ—Ñ–∏—Ü–∏–∞–ª–µ–Ω, —Ü–µ—Ä–µ–º–æ–Ω–∏—è, –ø–æ, –≤—Å—Ç...","[ADP, NOUN, ADP, ADJ, NOUN, ADP, NOUN, ADP, NO...","[R, Ncmpd, R, Afsd, Ncfsi, R, Ncnsi, R, Ncfsi,...","[, Definite=Def|Gender=Masc|Number=Plur, , Def...","[case, root, case, amod, nmod, case, nmod, cas...",31
9,–°—Ä–µ–¥ –≥–æ—Å—Ç–∏—Ç–µ –Ω–∞ –æ—Ñ–∏—Ü–∏–∞–ª–Ω–∞—Ç–∞ —Ü–µ—Ä–µ–º–æ–Ω–∏—è –ø–æ –≤—Å—Ç—ä–ø...,1,2,3,4,5,6,7,8,"[ADP, NOUN, ADP, ADJ, NOUN, ADP, NOUN, ADP, NO...","[—Å—Ä–µ–¥, –≥–æ—Å—Ç, –Ω–∞, –æ—Ñ–∏—Ü–∏–∞–ª–µ–Ω, —Ü–µ—Ä–µ–º–æ–Ω–∏—è, –ø–æ, –≤—Å—Ç...","[ADP, NOUN, ADP, ADJ, NOUN, ADP, NOUN, ADP, NO...","[R, Ncmpd, R, Afsd, Ncfsi, R, Ncnsi, R, Ncfsi,...","[, Definite=Def|Gender=Masc|Number=Plur, , Def...","[case, root, case, amod, nmod, case, nmod, cas...",31


## 4 Proof of concept

### 4.1 Define one rule

Define one simple rule and put it in a testing function. The rule is: if the word is a noun, is preceded by a preposition, and ends in _—ä—Ç_ or _—è—Ç_, then the long form of the definite article is incorrect. Note that in reality this rule is not enough to determine the correct usage of the definite article but for the purposes of the POC it will suffice.

In [None]:
def verify_definite_article(doc):
    """Detects incorrect usage of the definite article based on POS tags and word endings"""
    errors = []

    for i, token in enumerate(doc):
        # Check if the token is a noun with a definite article (full form)
        if token.pos_ == "NOUN" and token.text.endswith(("—ä—Ç", "—è—Ç")):
            # Check if the preceding token is a preposition (ADP), like "–≤"
            if i > 0 and doc[i - 1].pos_ == "ADP":
                errors.append(f"Incorrect usage of full definite article: '{token.text}' in sentence: '{doc.text}'")
    return errors

### 4.2 Test and validate

In [None]:
valid_sentences = ["–ê–∑ –æ—Ç–∏–≤–∞–º –≤ –æ—Ñ–∏—Å–∞.", "–ò–≤–∞–Ω –æ—Ç–∏–≤–∞ –≤ –æ—Ñ–∏—Å–∞.", "–û—Ñ–∏—Å—ä—Ç –µ –≤ –°–æ—Ñ–∏—è."]
invalid_sentences = ["–ê–∑ –æ—Ç–∏–≤–∞–º –≤ –æ—Ñ–∏—Å—ä—Ç.", "–ò–≤–∞–Ω –æ—Ç–∏–≤–∞ –≤ –æ—Ñ–∏—Å—ä—Ç.", "–û—Ñ–∏—Å–∞ –µ –≤ –°–æ—Ñ–∏—è."]

In [None]:
# Process each sentence with both libraries
results_st = []
results_ud = []
for sentence in valid_sentences + invalid_sentences:
    doc_st = nlp_st(sentence)
    doc_ud = nlp_ud(sentence)

    results_st.append(verify_definite_article(doc_st))
    results_ud.append(verify_definite_article(doc_ud))

In [None]:
print("Results from Stanza:")
for i, res in enumerate(results_st):
    print(f"{i+1}:", res)

print("\nResults from UDPipe:")
for i, res in enumerate(results_ud):
    print(f"{i+1}:", res)

Results from Stanza:
1: []
2: []
3: []
4: ["Incorrect usage of full definite article: '–æ—Ñ–∏—Å—ä—Ç' in sentence: '–ê–∑ –æ—Ç–∏–≤–∞–º –≤ –æ—Ñ–∏—Å—ä—Ç.'"]
5: ["Incorrect usage of full definite article: '–æ—Ñ–∏—Å—ä—Ç' in sentence: '–ò–≤–∞–Ω –æ—Ç–∏–≤–∞ –≤ –æ—Ñ–∏—Å—ä—Ç.'"]
6: []

Results from UDPipe:
1: []
2: []
3: []
4: ["Incorrect usage of full definite article: '–æ—Ñ–∏—Å—ä—Ç' in sentence: '–ê–∑ –æ—Ç–∏–≤–∞–º –≤ –æ—Ñ–∏—Å—ä—Ç. '"]
5: ["Incorrect usage of full definite article: '–æ—Ñ–∏—Å—ä—Ç' in sentence: '–ò–≤–∞–Ω –æ—Ç–∏–≤–∞ –≤ –æ—Ñ–∏—Å—ä—Ç. '"]
6: []


Of course we can't expect much correctness of this simple rule but we can see that it did find some incorrect usages in two out of the six cases. Not bad, actually!

## 5 Load and clean the testing data

This short set of sentences contains some common misuses of the definite article and also the correct usage. We will use this set for initial testing of the rules we develop.

In [None]:
test_set_1 = pd.read_csv("data/test_set_1.csv")

In [None]:
test_set_1.head()

Unnamed: 0,error_type,incorrect,correct
0,article_misuse,–ö–ª—é—á–∞ –µ –Ω–∞ –º–∞—Å–∞—Ç–∞.,–ö–ª—é—á—ä—Ç –µ –Ω–∞ –º–∞—Å–∞—Ç–∞.
1,article_misuse,–¶–∞—Ä—è –ø–∏–µ –≤–∏–Ω–æ.,–¶–∞—Ä—è—Ç –ø–∏–µ –≤–∏–Ω–æ.
2,article_misuse,–£—á–µ–Ω–∏–∫–∞ –µ —É–º–µ–Ω –∏ —Ç—Ä—É–¥–æ–ª—é–±–∏–≤.,–£—á–µ–Ω–∏–∫—ä—Ç –µ —É–º–µ–Ω –∏ —Ç—Ä—É–¥–æ–ª—é–±–∏–≤.
3,article_misuse,–ü—Ä–∏—è—Ç–µ–ª—è –º–∏ –µ –≤ —á—É–∂–±–∏–Ω–∞.,–ü—Ä–∏—è—Ç–µ–ª—è—Ç –º–∏ –µ –≤ —á—É–∂–±–∏–Ω–∞.
4,article_misuse,–ì–æ—Å—Ç–∞ –ø—Ä–∏—Å—Ç–∏–≥–Ω–∞.,–ì–æ—Å—Ç—ä—Ç –ø—Ä–∏—Å—Ç–∏–≥–Ω–∞.


### 5.1 Trim the sentences of whitespaces

In [None]:
test_set_1[['incorrect', 'correct']] = test_set_1[['incorrect', 'correct']].map(str.strip)

### 5.2 Add column with the differing words

In order to perform automated tests we will need to know were exactly is the error in the sentence, if there is an error. Therefore, we'll add a new column with the expected incorrect words. Before that we need to do some cleaning like removal of punctuation. Note that this function is designed to work with Bulgarian in order to keep words üü†

In [None]:
def clean_sentence_bg(sentence):
    """Cleans a Bulgarian sentence by removing unwanted punctuation but preserving valid dashes."""
    # Remove dashes not surrounded by exactly two bg letters
    # keep also numbers in order to preserve words like 5-–≥–æ–¥–∏—à–µ–Ω
    sentence = re.sub(r"(?<![0-9–∞-—è–ê-–Ø])-|-(?![–∞-—è–ê-–Ø])", "", sentence)

    # Remove all other punctuation except for valid dashes
    sentence = re.sub(r"[^\w\s-]", "", sentence)

    return sentence

In [None]:
def get_differing_words(row):
    """Extracts differing words between two sentences and logs word count discrepancies."""
    # Clean and split the sentences into words
    incorrect_words = clean_sentence_bg(row["incorrect"]).split()
    correct_words = clean_sentence_bg(row["correct"]).split()

    # Log if word count differs
    if len(incorrect_words) != len(correct_words):
        print(f"Word count differs: Incorrect - {len(incorrect_words)} words, Correct - {len(correct_words)} words")

    # Get words from the correct sentence that are not in the incorrect sentence
    differing_correct_words = [word for word in correct_words if word not in incorrect_words]
    differing_incorrect_words = [word for word in incorrect_words if word not in correct_words]

    return ",".join(differing_incorrect_words), ",".join(differing_correct_words)

In [None]:
# add a new column to hold the correct and incorrect words, they will be used for automated testing
test_set_1[["correct_words", "incorrect_words"]] = test_set_1.apply(get_differing_words, axis=1, result_type="expand")

In [None]:
test_set_1.head(3)

Unnamed: 0,error_type,incorrect,correct,correct_words,incorrect_words
0,article_misuse,–ö–ª—é—á–∞ –µ –Ω–∞ –º–∞—Å–∞—Ç–∞.,–ö–ª—é—á—ä—Ç –µ –Ω–∞ –º–∞—Å–∞—Ç–∞.,–ö–ª—é—á–∞,–ö–ª—é—á—ä—Ç
1,article_misuse,–¶–∞—Ä—è –ø–∏–µ –≤–∏–Ω–æ.,–¶–∞—Ä—è—Ç –ø–∏–µ –≤–∏–Ω–æ.,–¶–∞—Ä—è,–¶–∞—Ä—è—Ç
2,article_misuse,–£—á–µ–Ω–∏–∫–∞ –µ —É–º–µ–Ω –∏ —Ç—Ä—É–¥–æ–ª—é–±–∏–≤.,–£—á–µ–Ω–∏–∫—ä—Ç –µ —É–º–µ–Ω –∏ —Ç—Ä—É–¥–æ–ª—é–±–∏–≤.,–£—á–µ–Ω–∏–∫–∞,–£—á–µ–Ω–∏–∫—ä—Ç


### 5.3 Melt and sort

Merge the columns _incorrect_ and _correct_ to a single column _text_. Keep each pair of incorrect and correct sentences together.

In [None]:
def melt_and_sort_sentences(df):
    """Transforms a DataFrame to melt sentence pairs, ensuring incorrect and correct sentences are paired."""

    # Swap 'incorrect_words' and 'correct_words' columns
    df = df.rename(columns={"incorrect_words": "temp_correct_words", "correct_words": "incorrect_words"})
    df = df.rename(columns={"temp_correct_words": "correct_words"})

    # Add an order column to keep track of the original order
    df["order"] = df.index

    # Melt the DataFrame
    df_melted = pd.melt(
        df,
        id_vars=["correct_words", "incorrect_words", "order"],
        value_vars=["incorrect", "correct"],
        var_name="is_correct",
        value_name="text",
    )

    # Set 'is_correct' to True for 'correct' and False for 'incorrect'
    df_melted["is_correct"] = df_melted["is_correct"].apply(lambda x: x == "correct")

    # Assign incorrect_words and correct_words only to incorrect sentences
    df_melted["incorrect_words"] = df_melted.apply(lambda row: row["incorrect_words"] if not row["is_correct"] else "", axis=1)
    df_melted["correct_words"] = df_melted.apply(lambda row: row["correct_words"] if not row["is_correct"] else "", axis=1)

    # Sort by 'order' to ensure the pairs of correct/incorrect sentences are next to each other
    df_melted = df_melted.sort_values(by=["order", "is_correct"], ascending=[True, True])

    # Reset index for clean output
    df_melted.reset_index(drop=True, inplace=True)

    # Drop the 'order' column as it's no longer needed
    df_melted = df_melted.drop(columns=["order"])

    # Reorder columns to match the desired output
    df_melted = df_melted[["text", "is_correct", "incorrect_words", "correct_words"]]

    return df_melted

In [None]:
test_set_1 = melt_and_sort_sentences(test_set_1)

In [None]:
test_set_1.head(6)

Unnamed: 0,text,is_correct,incorrect_words,correct_words
0,–ö–ª—é—á–∞ –µ –Ω–∞ –º–∞—Å–∞—Ç–∞.,False,–ö–ª—é—á–∞,–ö–ª—é—á—ä—Ç
1,–ö–ª—é—á—ä—Ç –µ –Ω–∞ –º–∞—Å–∞—Ç–∞.,True,,
2,–¶–∞—Ä—è –ø–∏–µ –≤–∏–Ω–æ.,False,–¶–∞—Ä—è,–¶–∞—Ä—è—Ç
3,–¶–∞—Ä—è—Ç –ø–∏–µ –≤–∏–Ω–æ.,True,,
4,–£—á–µ–Ω–∏–∫–∞ –µ —É–º–µ–Ω –∏ —Ç—Ä—É–¥–æ–ª—é–±–∏–≤.,False,–£—á–µ–Ω–∏–∫–∞,–£—á–µ–Ω–∏–∫—ä—Ç
5,–£—á–µ–Ω–∏–∫—ä—Ç –µ —É–º–µ–Ω –∏ —Ç—Ä—É–¥–æ–ª—é–±–∏–≤.,True,,


### 5.4 Add columns with Stanza NLP features

In [None]:
if True:
    # extract stanza features as new columns
    feature_columns_st = ["pos", "tag", "dep", "morph", "lemmas", "left_edge", "right_edge", "num_tokens"]
    test_set_1[feature_columns_st] = test_set_1.apply(lambda r: extract_features(nlp_st, r, "text"), axis=1, result_type="expand")

    # save in case we want to load it faster
    test_set_1.to_csv("data/test_set_1_clean.csv", index=None)
else:
    # load from file instead of the above, it is faster
    test_set_1 = pd.read_csv("data/test_set_1_clean.csv")
    test_set_1["correct_words"] = test_set_1["incorrect_words"].fillna("")
    test_set_1["incorrect_words"] = test_set_1["incorrect_words"].fillna("")

In [None]:
test_set_1.columns

Index(['text', 'is_correct', 'incorrect_words', 'correct_words', 'pos', 'tag',
       'dep', 'morph', 'lemmas', 'left_edge', 'right_edge', 'num_tokens'],
      dtype='object')

In [None]:
test_set_1[["text", "num_tokens", "pos", "tag", "dep", "morph", "lemmas", "left_edge", "right_edge"]].head(1).T

Unnamed: 0,0
text,–ö–ª—é—á–∞ –µ –Ω–∞ –º–∞—Å–∞—Ç–∞.
num_tokens,5
pos,"NOUN,AUX,ADP,NOUN,PUNCT"
tag,"Ncmsh,Vxitf-r3s,R,Ncfsd,punct"
dep,"nsubj,cop,case,root,punct"
morph,"Definite=Def|Gender=Masc|Number=Sing,Aspect=Im..."
lemmas,"–∫–ª—é—á,—Å—ä–º,–Ω–∞,–º–∞—Å–∞,."
left_edge,"–ö–ª—é—á–∞,–µ,–Ω–∞,–ö–ª—é—á–∞,."
right_edge,"–ö–ª—é—á–∞,–µ,–Ω–∞,.,."


## 6 Define rule based logic

### 6.1 Implement NLP specific functions

This section contains functions that extract grammatical details like gender, number, and sentence role from tagged words.

In [None]:
def extract_gender_from_tag(tag):
    gender = " "
    # gender is applicable to common noun, proper noun or adjective
    if tag.startswith(("Nc", "Np", "A")):
        if "m" in tag:
            gender = "Masculine"
        elif "f" in tag:
            gender = "Feminine"
        elif "n" in tag:
            gender = "Neutral"
    return gender


# TODO: need to exclude triple character patterns like "p1s": "Past tense, 1st person sng",
def extract_number_from_tag(tag):
    number = None
    if "s" in tag:
        number = "Singular"
    elif "p" in tag:
        number = "Plural"
    return number


def is_masculine(row_dict, i):
    tag = row_dict["tag"].split(",")[i]
    return extract_gender_from_tag(tag) == "Masculine"


def is_singular(row_dict, i):
    tag = row_dict["tag"].split(",")[i]
    return extract_number_from_tag(tag) == "Singular"


def is_masculine_singular(row_dict, i):
    return is_masculine(row_dict, i) and is_singular(row_dict, i)


def is_dep_subject(row_dict, i):
    dep = row_dict["dep"].split(",")[i]
    return dep in {"nsubj", "csubj", "nsubj:pass", "csubj:pass"}


def nlp_get_pos(row_dict, i):
    return row_dict["pos"].split(",")[i]


def nlp_get_dep(row_dict, i):
    return row_dict["dep"].split(",")[i]


def nlp_get_lemma(row_dict, i):
    return row_dict["lemmas"].split(",")[i]


def nlp_get_article(row_dict, i):
    morph = row_dict["morph"].split(",")[i]
    if "Definite=Def" in morph:
        return "definite"
    elif "Definite=Ind" in morph:
        return "indefinite"
    else:
        return ""

In [None]:
# for quick viewing the features
# use updpipe object for the extraction of the word because of performance
# use stanza object for determining features because of accuracy
def inspect_word(row_dict, word_num):
    word = nlp_ud(row_dict["text"])[word_num].text
    pos = row_dict["pos"].split(",")[word_num]
    tag = row_dict["tag"].split(",")[word_num]
    dep = row_dict["dep"].split(",")[word_num]
    gen_s = extract_gender_from_tag(tag)[0:1]  # M - Masculine, F - Feminine, N - Neutral
    num_s = extract_number_from_tag(tag)[0:1]

    return f"{row_dict['index']} {word:<15} POS:{pos:<{10}} Gen:{gen_s}    Num:{num_s}   DEP:{dep:<{10}} Sent: {row_dict["text"]:<30}"

### 6.2 Implement generic testing function

Before we start implementing any rules, we should first create a testing function for them. Since we don‚Äôt have the specific rule functions ready, we‚Äôll set up a general testing function - actually, we‚Äôll create three functions - that can take in a list of conditions and a list of rules. It‚Äôs important that both lists have the same length. The testing function will process text, apply these conditions and rules to detect errors, and then compare the results with expected outcomes, logging whether each test passes or fails.

In [None]:
def all_rules(row_dict, condition_fns, rule_fns):
    # ‚ö†Ô∏èMake sure the sentence always ends with a punctuation. This is a workaround!
    # UDPipe not always separates into tokens correctly, sometimes reports the word+punctuation (or special symbol) as single token.
    # Stanza has bad performance and this function is called iteratively multiple times, not a good idea to use Stanza here.
    sentence = row_dict["text"] + "."
    doc = nlp_ud(sentence)
    errors = []

    # ‚ö†Ô∏èThis is a very ugly workaround. UDPipe does not correctly split into tokens when there is punctuation
    # in the middle of the sentence. We return "no errors" here which might make some tests fail but
    # the goal is not to raise an exception. The correct solution is not to use UDPipe at all and pre-calculate the tokens
    # in the beginning.
    if (len(doc) != row_dict['num_tokens']):
        return errors

    # Iterate over token pairs with indices
    # for i in range(len(doc) - 1):
    for i in range(row_dict['num_tokens'] - 1):
        token1 = doc[i].text
        token2 = doc[i + 1].text

        conditions = [condition_fn(row_dict, i) for condition_fn in condition_fns]

        for condition, rule_fn in zip(conditions, rule_fns):
            if condition:
                errors += rule_fn(token1, token2, i)

    return errors

In [None]:
def test_definite_article(row_dict, condition_fns, rule_fns):
    actual_wrong_words = []
    expected_wrong_words = [word.strip() for word in row_dict["incorrect_words"].split(",") if word.strip() != ""]
    actual_wrong_words = all_rules(row_dict, condition_fns, rule_fns)

    test_res = "Pass" if set(actual_wrong_words) == set(expected_wrong_words) else "Fail"

    return (test_res, actual_wrong_words)

In [None]:
# tests all sentences from a dataset that already contains NLP tags
def test_definite_article_all(nlp_df, conditions_fns, rules_fns, print_passed=True, print_failed=True, print_total=True):
    """Tests definite articles in sentences and logs results."""
    idx_failed, idx_passed = [], []

    for i, row in nlp_df.iterrows():
        row_dict = row.to_dict()
        res = test_definite_article(row_dict, conditions_fns, rules_fns)
        status, incorrect = res[0], res[1]

        if status == "Fail":
            idx_failed.append(i)
            if print_failed:
                print(i, f'‚ùå {row_dict["text"]} (Actual: \'{",".join(incorrect)}\', Expected: \'{row_dict["incorrect_words"]}\')')
        else:
            idx_passed.append(i)
            message = f'‚úÖ {row_dict["text"]} (\'{",".join(incorrect)}\' is incorrect)' if incorrect else f'‚úÖ {row_dict["text"]} (The sentence is correct)'
            if print_passed:
                print(i, message)

    if print_total:
        n_failed = len(idx_failed)
        print("‚úÖ All tests passed" if n_failed == 0 else f"‚ùå {n_failed}/{len(nlp_df)} failed.")

    return idx_passed, idx_failed

### 6.3 Iteration 1

#### 6.3.1 Implement Rule1

According to Rule1, long definite article should be used when the noun takes the role of the subject. Additionally, in order to exclude groups of adjective + noun, which fall under Rule3, we need to exclude noun that follow an adjective.

In [None]:
def cond_rule1(row_dict, i):
    """Searches for a NOUN, masculine, singular, which is the SUBJECT (nominal or clausal) in the sentence,
       and is not preceded by an ADJective"""
    is_token1_adj = nlp_get_pos(row_dict, i) == "ADJ"
    is_token1_noun = nlp_get_pos(row_dict, i) == "NOUN"
    is_token1_masc_sg = is_masculine_singular(row_dict, i)
    is_token1_subj = is_dep_subject(row_dict, i)

    is_token2_noun = nlp_get_pos(row_dict, i + 1) == "NOUN"
    is_token2_masc_sg = is_masculine_singular(row_dict, i + 1)
    is_token2_subj = is_dep_subject(row_dict, i + 1)

    # if token is masculine singular noun and subj, and the previous token is not an adjective
    if i == 0 and is_token1_masc_sg and is_token1_noun and is_token1_subj:
        return True
    if i != 0 and not is_token1_adj and is_token2_masc_sg and is_token2_noun and is_token2_subj:
        return True

    return False

In [None]:
def rule1(token1, token2, token_idx):
    errors = []
    word = token1 if token_idx == 0 else token2
    if not word.endswith(("—ä—Ç", "—è—Ç")):
        errors.append(word)
    return errors

#### 6.3.2 Test Rule1

In [None]:
idx_passed, idx_current = test_definite_article_all(test_set_1, [cond_rule1], [rule1])

0 ‚úÖ –ö–ª—é—á–∞ –µ –Ω–∞ –º–∞—Å–∞—Ç–∞. ('–ö–ª—é—á–∞' is incorrect)
1 ‚úÖ –ö–ª—é—á—ä—Ç –µ –Ω–∞ –º–∞—Å–∞—Ç–∞. (The sentence is correct)
2 ‚úÖ –¶–∞—Ä—è –ø–∏–µ –≤–∏–Ω–æ. ('–¶–∞—Ä—è' is incorrect)
3 ‚úÖ –¶–∞—Ä—è—Ç –ø–∏–µ –≤–∏–Ω–æ. (The sentence is correct)
4 ‚úÖ –£—á–µ–Ω–∏–∫–∞ –µ —É–º–µ–Ω –∏ —Ç—Ä—É–¥–æ–ª—é–±–∏–≤. ('–£—á–µ–Ω–∏–∫–∞' is incorrect)
5 ‚úÖ –£—á–µ–Ω–∏–∫—ä—Ç –µ —É–º–µ–Ω –∏ —Ç—Ä—É–¥–æ–ª—é–±–∏–≤. (The sentence is correct)
6 ‚úÖ –ü—Ä–∏—è—Ç–µ–ª—è –º–∏ –µ –≤ —á—É–∂–±–∏–Ω–∞. ('–ü—Ä–∏—è—Ç–µ–ª—è' is incorrect)
7 ‚úÖ –ü—Ä–∏—è—Ç–µ–ª—è—Ç –º–∏ –µ –≤ —á—É–∂–±–∏–Ω–∞. (The sentence is correct)
8 ‚úÖ –ì–æ—Å—Ç–∞ –ø—Ä–∏—Å—Ç–∏–≥–Ω–∞. ('–ì–æ—Å—Ç–∞' is incorrect)
9 ‚úÖ –ì–æ—Å—Ç—ä—Ç –ø—Ä–∏—Å—Ç–∏–≥–Ω–∞. (The sentence is correct)
10 ‚ùå –ì–æ—Ç–≤–∞—á–∞ –ø—Ä–∏–≥–æ—Ç–≤–∏ –æ–±—è–¥–∞. (Actual: '', Expected: '–ì–æ—Ç–≤–∞—á–∞')
11 ‚úÖ –ì–æ—Ç–≤–∞—á—ä—Ç –ø—Ä–∏–≥–æ—Ç–≤–∏ –æ–±—è–¥–∞. (The sentence is correct)
12 ‚úÖ –í–ª–∞–∫–∞ —Å–ø—Ä—è –Ω–∞ –≥–∞—Ä–∞ –°–æ—Ñ–∏—è. ('–í–ª–∞–∫–∞' is incorrect)
13 ‚úÖ –í–ª–∞–∫—ä—Ç —Å–ø—Ä—è

##### Analyze some of the failures

Now let's ta–∫e the first several failed tests and figure out why they failed.

In [None]:
def row_to_dict(df, index):
    row_dict = df.loc[index].to_dict()
    row_dict["index"] = index
    return row_dict

In [None]:
res = []
res.append(inspect_word(row_to_dict(test_set_1, 10), 0))  # üëâ–ì–æ—Ç–≤–∞—á–∞ –ø—Ä–∏–≥–æ—Ç–≤–∏ –æ–±—è–¥–∞.
res.append(inspect_word(row_to_dict(test_set_1, 16), 2))  # –¢–∞–Ω—è —Ç—ä—Ä—Å–∏ üëâ–ª–µ–∫–∞—Ä—è—Ç.
res.append(inspect_word(row_to_dict(test_set_1, 18), 2))  # –ì–æ–≤–æ—Ä–∏–º –∑–∞ üëâ–ª–µ–∫–∞—Ä—è—Ç.
res.append(inspect_word(row_to_dict(test_set_1, 20), 5))  # –¢–∞–Ω—è —Ç—ä—Ä—Å–∏ –ê–Ω–≥–µ–ª –ò–≤–∞–Ω—á–µ–≤, üëâ–ª–µ–∫–∞—Ä—è—Ç.
res.append(inspect_word(row_to_dict(test_set_1, 22), 5))  # –ì–æ–≤–æ—Ä–∏–º –∑–∞ –ê–Ω–≥–µ–ª –ò–≤–∞–Ω—á–µ–≤, üëâ–ª–µ–∫–∞—Ä—è—Ç.
res.append(inspect_word(row_to_dict(test_set_1, 24), 3))  # –¢–æ–π –º–∏ –¥–∞–¥–µ üëâ–∫–ª—é—á—ä—Ç –æ—Ç –∫—ä—â–∞—Ç–∞.
print("\n".join(res))

10 –ì–æ—Ç–≤–∞—á–∞         POS:NOUN       Gen:F    Num:S   DEP:nsubj      Sent: –ì–æ—Ç–≤–∞—á–∞ –ø—Ä–∏–≥–æ—Ç–≤–∏ –æ–±—è–¥–∞.       
16 –ª–µ–∫–∞—Ä—è—Ç         POS:NOUN       Gen:M    Num:S   DEP:nsubj      Sent: –¢–∞–Ω—è —Ç—ä—Ä—Å–∏ –ª–µ–∫–∞—Ä—è—Ç.           
18 –ª–µ–∫–∞—Ä—è—Ç         POS:NOUN       Gen:M    Num:S   DEP:iobj       Sent: –ì–æ–≤–æ—Ä–∏–º –∑–∞ –ª–µ–∫–∞—Ä—è—Ç.           
20 –ª–µ–∫–∞—Ä—è—Ç         POS:NOUN       Gen:M    Num:S   DEP:nmod       Sent: –¢–∞–Ω—è —Ç—ä—Ä—Å–∏ –ê–Ω–≥–µ–ª –ò–≤–∞–Ω—á–µ–≤, –ª–µ–∫–∞—Ä—è—Ç.
22 –ª–µ–∫–∞—Ä—è—Ç         POS:NOUN       Gen:M    Num:S   DEP:conj       Sent: –ì–æ–≤–æ—Ä–∏–º –∑–∞ –ê–Ω–≥–µ–ª –ò–≤–∞–Ω—á–µ–≤, –ª–µ–∫–∞—Ä—è—Ç.
24 –∫–ª—é—á—ä—Ç          POS:NOUN       Gen:M    Num:S   DEP:obj        Sent: –¢–æ–π –º–∏ –¥–∞–¥–µ –∫–ª—é—á—ä—Ç –æ—Ç –∫—ä—â–∞—Ç–∞. 


The problem with _**–≥–æ—Ç–≤–∞—á–∞**_ is that it is considered to be feminine by Stanza. For now we **will postpone fixing** because without context it is impossible to determine the gender, and without gender we can't apply Rule1.

The word _**–ª–µ–∫–∞—Ä—è—Ç**_ in _–¢–∞–Ω—è —Ç—ä—Ä—Å–∏ –ª–µ–∫–∞—Ä—è—Ç_ is determined as an nsubj (nominal subject) by Stanza but the correct dependency is object. Well, grammatically it would be correct if we assume that the doctor is the doer in the sentence. However, that is the less popular word order. Let's **postpone fixing** this sentence for later.

The word _**–õ–µ–∫–∞—Ä—è—Ç**_ in the sentences \
_–ì–æ–≤–æ—Ä–∏–º –∑–∞ –ª–µ–∫–∞—Ä—è—Ç_, \
_–¢–∞–Ω—è —Ç—ä—Ä—Å–∏ –ê–Ω–≥–µ–ª –ò–≤–∞–Ω—á–µ–≤, –ª–µ–∫–∞—Ä—è—Ç_ \
and _–ì–æ–≤–æ—Ä–∏–º –∑–∞ –ê–Ω–≥–µ–ª –ò–≤–∞–Ω—á–µ–≤, –ª–µ–∫–∞—Ä—è—Ç._ \
is determined to be iobj (indirect object), nmod (nominal modifier) and conj (conjust) so that falls under Rule2.

_**–∫–ª—é—á—ä—Ç**_ is marked as object so it also falls under Rule2.

So the last 4 failures are expected to be fixed after we implement Rule2.

Separate the failed and analyzed tests into one list and the passed tests into another list. The one with the passed tests will be used for regression testing.

In [None]:
idx_passed = sorted(list(set(idx_passed) - set([10, 16])))
idx_current = sorted(list(set(idx_current) - set([10, 16])))

### 6.4 Iteration 2

#### 6.4.1 Implement Rule2

Here we will implement the first part of Rule2, which states that when the noun takes the role of the object, then the short form of the definite article should be used.

In [None]:
def cond_rule2(row_dict, i):
    """Searches for masculine singular nouns that are the object in the sentence"""
    is_token1_noun = nlp_get_pos(row_dict, i) == "NOUN"
    is_token1_masc_sg = is_masculine_singular(row_dict, i)
    is_token1_subj = is_dep_subject(row_dict, i)

    if is_token1_noun and is_token1_masc_sg and not is_token1_subj:
        return True
    return False

In [None]:
def rule2(token1, token2, idx_token):
    errors = []
    if token1.endswith(("—ä—Ç", "—è—Ç")):  # objects should not have full article
        errors.append(token1)
    return errors

#### 6.4.2 Test Rule2

In [None]:
p, c = test_definite_article_all(test_set_1.loc[idx_current], [cond_rule1, cond_rule2], [rule1, rule2])

18 ‚úÖ –ì–æ–≤–æ—Ä–∏–º –∑–∞ –ª–µ–∫–∞—Ä—è—Ç. ('–ª–µ–∫–∞—Ä—è—Ç' is incorrect)
20 ‚úÖ –¢–∞–Ω—è —Ç—ä—Ä—Å–∏ –ê–Ω–≥–µ–ª –ò–≤–∞–Ω—á–µ–≤, –ª–µ–∫–∞—Ä—è—Ç. ('–ª–µ–∫–∞—Ä—è—Ç' is incorrect)
22 ‚úÖ –ì–æ–≤–æ—Ä–∏–º –∑–∞ –ê–Ω–≥–µ–ª –ò–≤–∞–Ω—á–µ–≤, –ª–µ–∫–∞—Ä—è—Ç. ('–ª–µ–∫–∞—Ä—è—Ç' is incorrect)
24 ‚úÖ –¢–æ–π –º–∏ –¥–∞–¥–µ –∫–ª—é—á—ä—Ç –æ—Ç –∫—ä—â–∞—Ç–∞. ('–∫–ª—é—á—ä—Ç' is incorrect)
26 ‚ùå –ü—Ä–∏–Ω—Ü–∞ –≥–æ–≤–æ—Ä–∏ —Å —Ü–∞—Ä—è—Ç. (Actual: '—Ü–∞—Ä—è—Ç', Expected: '–ü—Ä–∏–Ω—Ü–∞,—Ü–∞—Ä—è—Ç')
28 ‚úÖ –ê–∑ –ø–æ–º–∞–≥–∞–º –Ω–∞ —É—á–µ–Ω–∏–∫—ä—Ç. ('—É—á–µ–Ω–∏–∫—ä—Ç' is incorrect)
30 ‚úÖ –ú–Ω–æ–≥–æ –ø—ä—Ç–Ω–∏—Ü–∏ —Å–ª—è–∑–æ—Ö–∞ –æ—Ç –≤–ª–∞–∫—ä—Ç. ('–≤–ª–∞–∫—ä—Ç' is incorrect)
32 ‚úÖ –°–ª—É—à–∞—Ö–º–µ —Å –∏–Ω—Ç–µ—Ä–µ—Å —É—á–∏—Ç–µ–ª—è—Ç. ('—É—á–∏—Ç–µ–ª—è—Ç' is incorrect)
34 ‚ùå –î—ä–ª–≥–æ–æ—á–∞–∫–≤–∞–Ω–∏—è –≥–æ—Å—Ç –¥–æ–π–¥–µ. (Actual: '', Expected: '–î—ä–ª–≥–æ–æ—á–∞–∫–≤–∞–Ω–∏—è')
36 ‚ùå –ò–≤–∞–Ω, –≥–æ—Å—Ç–∞ –æ—Ç –°–æ–ø–æ—Ç, –¥–æ–π–¥–µ. (Actual: '', Expected: '–≥–æ—Å—Ç–∞')
38 ‚ùå –ò–≤–∞–Ω –µ —Å—ä—É—á–µ–Ω–∏–∫–∞ –º–∏

In the previous run rows 18, 20, 22 and 24 failed, now after implementing Rule2 they pass. Additionally, 4 more tests pass.

##### Analyze some of the failures

In [None]:
res = []
res.append(inspect_word(row_to_dict(test_set_1, 26), 0))  # üëâ–ü—Ä–∏–Ω—Ü–∞ –≥–æ–≤–æ—Ä–∏ —Å —Ü–∞—Ä—è—Ç.
res.append(inspect_word(row_to_dict(test_set_1, 34), 0))  # üëâ–î—ä–ª–≥–æ–æ—á–∞–∫–≤–∞–Ω–∏—è –≥–æ—Å—Ç –¥–æ–π–¥–µ.
res.append(inspect_word(row_to_dict(test_set_1, 34), 1))  # –î—ä–ª–≥–æ–æ—á–∞–∫–≤–∞–Ω–∏—è üëâ–≥–æ—Å—Ç –¥–æ–π–¥–µ.
res.append(inspect_word(row_to_dict(test_set_1, 36), 2))  # –ò–≤–∞–Ω, üëâ–≥–æ—Å—Ç–∞ –æ—Ç –°–æ–ø–æ—Ç, –¥–æ–π–¥–µ.
res.append(inspect_word(row_to_dict(test_set_1, 38), 2))  # –ò–≤–∞–Ω –µ üëâ—Å—ä—É—á–µ–Ω–∏–∫–∞ –º–∏.
res.append(inspect_word(row_to_dict(test_set_1, 40), 2))  # –ò–≤–∞–Ω –µ üëâ–¥–æ–±—Ä–∏—è.
res.append(inspect_word(row_to_dict(test_set_1, 42), 2))  # –ò–≤–∞–Ω –µ üëâ—É—Å–ø–µ–ª–∏—è.
print("\n".join(res))

26 –ü—Ä–∏–Ω—Ü–∞          POS:NOUN       Gen:F    Num:S   DEP:nsubj      Sent: –ü—Ä–∏–Ω—Ü–∞ –≥–æ–≤–æ—Ä–∏ —Å —Ü–∞—Ä—è—Ç.        
34 –î—ä–ª–≥–æ–æ—á–∞–∫–≤–∞–Ω–∏—è  POS:ADJ        Gen:M    Num:S   DEP:amod       Sent: –î—ä–ª–≥–æ–æ—á–∞–∫–≤–∞–Ω–∏—è –≥–æ—Å—Ç –¥–æ–π–¥–µ.    
34 –≥–æ—Å—Ç            POS:NOUN       Gen:M    Num:S   DEP:nsubj      Sent: –î—ä–ª–≥–æ–æ—á–∞–∫–≤–∞–Ω–∏—è –≥–æ—Å—Ç –¥–æ–π–¥–µ.    
36 –≥–æ—Å—Ç–∞           POS:NOUN       Gen:M    Num:S   DEP:nmod       Sent: –ò–≤–∞–Ω, –≥–æ—Å—Ç–∞ –æ—Ç –°–æ–ø–æ—Ç, –¥–æ–π–¥–µ.  
38 —Å—ä—É—á–µ–Ω–∏–∫–∞       POS:NOUN       Gen:M    Num:S   DEP:root       Sent: –ò–≤–∞–Ω –µ —Å—ä—É—á–µ–Ω–∏–∫–∞ –º–∏.          
40 –¥–æ–±—Ä–∏—è          POS:ADJ        Gen:M    Num:S   DEP:root       Sent: –ò–≤–∞–Ω –µ –¥–æ–±—Ä–∏—è.                
42 —É—Å–ø–µ–ª–∏—è         POS:VERB       Gen:     Num:S   DEP:root       Sent: –ò–≤–∞–Ω –µ —É—Å–ø–µ–ª–∏—è.               


* _–ü—Ä–∏–Ω—Ü–∞_ - considered feminine, which is wrong, let's postpone fixing for now.
* _–î—ä–ª–≥–æ–æ—á–∞–∫–≤–∞–Ω–∏—è_  - will be fixed by Rule3
* _–≥–æ—Å—Ç_            - also will be fixed by Rule3
* _–≥–æ—Å—Ç–∞_           - add to later (not sure yet which rule to apply)
* _—Å—ä—É—á–µ–Ω–∏–∫–∞_       - add to later (because root)
* _–¥–æ–±—Ä–∏—è_          - add to later (because root)
* _—É—Å–ø–µ–ª–∏—è_         - Stanza thinks this is a VERB, which is wrong, so let's add to the list of "will not fix"

In [None]:
# keep the lists of passed and current tests up to date
idx_failed = [26, 36, 38, 40, 42]
idx_passed = sorted(set(idx_passed + p))
idx_passed = sorted(list(set(idx_passed) - set(idx_failed)))
idx_current = sorted(list(set(idx_current) - set(p)))
idx_current = sorted(list(set(idx_current) - set(idx_failed)))

#### 6.4.3 Regression Test

Test the previously successful sentences to determine if Rule2 caused any breakages. Perform the regression using all rules defined till now (Rule1, Rule2)

In [None]:
_ = test_definite_article_all(test_set_1.loc[idx_passed], [cond_rule1, cond_rule2], [rule1, rule2], print_passed=False)

37 ‚ùå –ò–≤–∞–Ω, –≥–æ—Å—Ç—ä—Ç –æ—Ç –°–æ–ø–æ—Ç, –¥–æ–π–¥–µ. (Actual: '–≥–æ—Å—Ç—ä—Ç', Expected: '')
39 ‚ùå –ò–≤–∞–Ω –µ —Å—ä—É—á–µ–Ω–∏–∫—ä—Ç –º–∏. (Actual: '—Å—ä—É—á–µ–Ω–∏–∫—ä—Ç', Expected: '')
59 ‚ùå –ü—Ä–∏—è—Ç–µ–ª—è—Ç –Ω–∏ –µ –ª–µ–∫–∞—Ä—è—Ç. (Actual: '–ª–µ–∫–∞—Ä—è—Ç', Expected: '')
61 ‚ùå –ê–Ω–≥–µ–ª –ò–≤–∞–Ω—á–µ–≤, –ª–µ–∫–∞—Ä—è—Ç, –∂–∏–≤–µ–µ —Ç—É–∫–∞. (Actual: '–ª–µ–∫–∞—Ä—è—Ç', Expected: '')
63 ‚ùå –ü—Ä–∏—è—Ç–µ–ª—è—Ç –Ω–∏ –µ –ê–Ω–≥–µ–ª –ò–≤–∞–Ω—á–µ–≤, –ª–µ–∫–∞—Ä—è—Ç. (Actual: '–ü—Ä–∏—è—Ç–µ–ª—è—Ç,–ª–µ–∫–∞—Ä—è—Ç', Expected: '')
‚ùå 5/58 failed.


##### Analyze results from the regression

In [None]:
res = []
res.append(inspect_word(row_to_dict(test_set_1, 37), 2))  # –ò–≤–∞–Ω, üëâ–≥–æ—Å—Ç—ä—Ç –æ—Ç –°–æ–ø–æ—Ç, –¥–æ–π–¥–µ.
res.append(inspect_word(row_to_dict(test_set_1, 39), 2))  # –ò–≤–∞–Ω –µ üëâ—Å—ä—É—á–µ–Ω–∏–∫—ä—Ç –º–∏.
res.append(inspect_word(row_to_dict(test_set_1, 59), 3))  # –ü—Ä–∏—è—Ç–µ–ª—è—Ç –Ω–∏ –µ üëâ–ª–µ–∫–∞—Ä—è—Ç.
res.append(inspect_word(row_to_dict(test_set_1, 61), 3))  # –ê–Ω–≥–µ–ª –ò–≤–∞–Ω—á–µ–≤, üëâ–ª–µ–∫–∞—Ä—è—Ç, –∂–∏–≤–µ–µ —Ç—É–∫–∞.
res.append(inspect_word(row_to_dict(test_set_1, 63), 6))  # –ü—Ä–∏—è—Ç–µ–ª—è—Ç –Ω–∏ –µ –ê–Ω–≥–µ–ª –ò–≤–∞–Ω—á–µ–≤, üëâ–ª–µ–∫–∞—Ä—è—Ç.
print("\n".join(res))

37 –≥–æ—Å—Ç—ä—Ç          POS:NOUN       Gen:M    Num:S   DEP:nmod       Sent: –ò–≤–∞–Ω, –≥–æ—Å—Ç—ä—Ç –æ—Ç –°–æ–ø–æ—Ç, –¥–æ–π–¥–µ. 
39 —Å—ä—É—á–µ–Ω–∏–∫—ä—Ç      POS:NOUN       Gen:M    Num:S   DEP:root       Sent: –ò–≤–∞–Ω –µ —Å—ä—É—á–µ–Ω–∏–∫—ä—Ç –º–∏.         
59 –ª–µ–∫–∞—Ä—è—Ç         POS:NOUN       Gen:M    Num:S   DEP:root       Sent: –ü—Ä–∏—è—Ç–µ–ª—è—Ç –Ω–∏ –µ –ª–µ–∫–∞—Ä—è—Ç.       
61 –ª–µ–∫–∞—Ä—è—Ç         POS:NOUN       Gen:M    Num:S   DEP:nmod       Sent: –ê–Ω–≥–µ–ª –ò–≤–∞–Ω—á–µ–≤, –ª–µ–∫–∞—Ä—è—Ç, –∂–∏–≤–µ–µ —Ç—É–∫–∞.
63 –ª–µ–∫–∞—Ä—è—Ç         POS:NOUN       Gen:M    Num:S   DEP:conj       Sent: –ü—Ä–∏—è—Ç–µ–ª—è—Ç –Ω–∏ –µ –ê–Ω–≥–µ–ª –ò–≤–∞–Ω—á–µ–≤, –ª–µ–∫–∞—Ä—è—Ç.


When we implement the next rules these failures should be fixed. For now let's move them to the list with failed tests. We'll get back to them later.

In [None]:
idx_failed = [37, 39, 59, 61, 63]
idx_passed = sorted(list(set(idx_passed) - set(idx_failed)))

### 6.5 Iteration 3

#### 6.5.1 Implement Rule3

Rule3 states that adjectives, numerals, participles and possessive pronouns take the same article as the noun they agree with. Let's first implement the rules for adjectives.

In [None]:
# –ü—ä–ª–µ–Ω —á–ª–µ–Ω —Å–µ –ø–æ—è–≤—è–≤–∞ –≤ —Ü—è–ª–∞—Ç–∞ –≥—Ä—É–ø–∞ –Ω–∞ –ø–æ–¥–ª–æ–≥–∞, –∫—ä–º –∫–æ—è—Ç–æ –ø—Ä–∏–Ω–∞–¥–ª–µ–∂–∞—Ç –Ω–µ–≥–æ–≤–∏—Ç–µ –æ–ø—Ä–µ–¥–µ–ª–µ–Ω–∏—è –∏–ª–∏ –ø—Ä–∏–ª–æ–∂–µ–Ω–∏—è. [3]
# In adjective-noun phrases, only the adjective takes a definite article ending.
def cond_rule3(row_dict, i):
    """Searches for ADJ/amod + NOUN/nsubj, both masculine/singular"""
    is_token1_adj = nlp_get_pos(row_dict, i) == "ADJ"
    is_token1_masc_sg = is_masculine_singular(row_dict, i)
    is_token1_amod = nlp_get_dep(row_dict, i) == "amod"

    is_token2_noun = nlp_get_pos(row_dict, i + 1) == "NOUN"
    is_token2_masc_sg = is_masculine_singular(row_dict, i + 1)
    is_token2_nsubj = nlp_get_dep(row_dict, i + 1) == "nsubj"

    if is_token1_adj and is_token2_noun:
        if is_token1_masc_sg and is_token2_masc_sg:
            if is_token1_amod and is_token2_nsubj:
                return True
    return False

In [None]:
def rule3(token1, token2, idx_token):
    errors = []

    # –î—ä–ª–≥–æ–æ—á–∞–∫–≤–∞–Ω–∏—è –≥–æ—Å—Ç
    if token1.endswith(("—è")):  # –ø—Ä–∏–ª–∞–≥–∞—Ç–µ–ª–Ω–æ—Ç–æ —Ç—Ä—è–±–≤–∞ –¥–∞ –µ —Å –ø—ä–ª–µ–Ω —á–ª–µ–Ω
        errors.append(token1)

    # –î—ä–ª–≥–æ–æ—á–∞–∫–≤–∞–Ω–∏—è—Ç –≥–æ—Å—Ç
    if token2.endswith(("—ä—Ç", "—è—Ç")):  # —Å—ä—â–µ—Å—Ç–≤–∏—Ç–µ–ª–Ω–æ—Ç–æ —Ç—Ä—è–±–≤–∞ –¥–∞ –µ —Å –Ω–µ–ø—ä–ª–µ–Ω —á–ª–µ–Ω
        errors.append(token2)

    return errors

#### 6.5.2 Test Rule3

In [None]:
conditions = [cond_rule1, cond_rule2, cond_rule3]
rules = [rule1, rule2, rule3]
p, c = test_definite_article_all(test_set_1.loc[idx_current], conditions, rules)

34 ‚úÖ –î—ä–ª–≥–æ–æ—á–∞–∫–≤–∞–Ω–∏—è –≥–æ—Å—Ç –¥–æ–π–¥–µ. ('–î—ä–ª–≥–æ–æ—á–∞–∫–≤–∞–Ω–∏—è' is incorrect)
44 ‚ùå –ü–µ—à–æ —Å–µ –æ–∫–∞–∑–∞ –Ω–∞–π-–≤–µ—Ä–Ω–∏—è –º—É –ø—Ä–∏—è—Ç–µ–ª. (Actual: '', Expected: '–Ω–∞–π-–≤–µ—Ä–Ω–∏—è')
46 ‚ùå –ü–µ—à–æ —Å–µ –æ–∫–∞–∑–∞ –¥–∞—Ä–∏—Ç–µ–ª—è –Ω–∞ —É—á–∏–ª–∏—â–µ—Ç–æ. (Actual: '', Expected: '–¥–∞—Ä–∏—Ç–µ–ª—è')
48 ‚ùå –£—á–µ–Ω–∏–∫—ä—Ç –Ω–∞ –ø—ä—Ä–≤–∏—è —á–∏–Ω –∏–∑–≥–ª–µ–∂–¥–∞ –Ω–∞–π-–¥–æ–≤–æ–ª–Ω–∏—è –æ—Ç –≤—Å–∏—á–∫–∏. (Actual: '', Expected: '–Ω–∞–π-–¥–æ–≤–æ–ª–Ω–∏—è')
50 ‚ùå –ú–∞–ª–∫–∏—è, –∑–∞–ø–æ–≤—è–¥–∞–π –µ–¥–Ω–æ –±–æ–Ω–±–æ–Ω—á–µ! (Actual: '', Expected: '–ú–∞–ª–∫–∏—è')
58 ‚ùå –ü—Ä–∏—è—Ç–µ–ª—è –Ω–∏ –µ –ª–µ–∫–∞—Ä—è. (Actual: '–ü—Ä–∏—è—Ç–µ–ª—è', Expected: '–ü—Ä–∏—è—Ç–µ–ª—è,–ª–µ–∫–∞—Ä—è')
60 ‚ùå –ê–Ω–≥–µ–ª –ò–≤–∞–Ω—á–µ–≤, –ª–µ–∫–∞—Ä—è, –∂–∏–≤–µ–µ —Ç—É–∫–∞. (Actual: '', Expected: '–ª–µ–∫–∞—Ä—è')
62 ‚ùå –ü—Ä–∏—è—Ç–µ–ª—è –Ω–∏ –µ –ê–Ω–≥–µ–ª –ò–≤–∞–Ω—á–µ–≤, –ª–µ–∫–∞—Ä—è. (Actual: '', Expected: '–ü—Ä–∏—è—Ç–µ–ª—è,–ª–µ–∫–∞—Ä—è')
64 ‚ùå –¢–æ–π –µ –Ω–∞–π-–≤–∏—Å–æ–∫–∏—è –∏

##### Analyze the test

In [None]:
res = []
res.append(inspect_word(row_to_dict(test_set_1, 44), 3))  # –ü–µ—à–æ —Å–µ –æ–∫–∞–∑–∞ üëâ–Ω–∞–π-–≤–µ—Ä–Ω–∏—è –º—É –ø—Ä–∏—è—Ç–µ–ª.
res.append(inspect_word(row_to_dict(test_set_1, 46), 3))  # –ü–µ—à–æ —Å–µ –æ–∫–∞–∑–∞ üëâ–¥–∞—Ä–∏—Ç–µ–ª—è –Ω–∞ —É—á–∏–ª–∏—â–µ—Ç–æ.
res.append(inspect_word(row_to_dict(test_set_1, 48), 5))  # –£—á–µ–Ω–∏–∫—ä—Ç –Ω–∞ –ø—ä—Ä–≤–∏—è —á–∏–Ω –∏–∑–≥–ª–µ–∂–¥–∞ üëâ–Ω–∞–π-–¥–æ–≤–æ–ª–Ω–∏—è –æ—Ç –≤—Å–∏—á–∫–∏.
res.append(inspect_word(row_to_dict(test_set_1, 50), 0))  # üëâ–ú–∞–ª–∫–∏—è, –∑–∞–ø–æ–≤—è–¥–∞–π –µ–¥–Ω–æ –±–æ–Ω–±–æ–Ω—á–µ!
res.append(inspect_word(row_to_dict(test_set_1, 58), 0))  # üëâ–ü—Ä–∏—è—Ç–µ–ª—è –Ω–∏ –µ üëâ–ª–µ–∫–∞—Ä—è.
res.append(inspect_word(row_to_dict(test_set_1, 60), 3))  # –ê–Ω–≥–µ–ª –ò–≤–∞–Ω—á–µ–≤, üëâ–ª–µ–∫–∞—Ä—è, –∂–∏–≤–µ–µ —Ç—É–∫–∞.
res.append(inspect_word(row_to_dict(test_set_1, 62), 0))  # üëâ–ü—Ä–∏—è—Ç–µ–ª—è –Ω–∏ –µ –ê–Ω–≥–µ–ª –ò–≤–∞–Ω—á–µ–≤, üëâ–ª–µ–∫–∞—Ä—è.
print("\n".join(res))

44 –Ω–∞–π-–≤–µ—Ä–Ω–∏—è      POS:ADJ        Gen:M    Num:S   DEP:amod       Sent: –ü–µ—à–æ —Å–µ –æ–∫–∞–∑–∞ –Ω–∞–π-–≤–µ—Ä–Ω–∏—è –º—É –ø—Ä–∏—è—Ç–µ–ª.
46 –¥–∞—Ä–∏—Ç–µ–ª—è        POS:NOUN       Gen:M    Num:S   DEP:obj        Sent: –ü–µ—à–æ —Å–µ –æ–∫–∞–∑–∞ –¥–∞—Ä–∏—Ç–µ–ª—è –Ω–∞ —É—á–∏–ª–∏—â–µ—Ç–æ.
48 –Ω–∞–π-–¥–æ–≤–æ–ª–Ω–∏—è    POS:ADJ        Gen:M    Num:S   DEP:obj        Sent: –£—á–µ–Ω–∏–∫—ä—Ç –Ω–∞ –ø—ä—Ä–≤–∏—è —á–∏–Ω –∏–∑–≥–ª–µ–∂–¥–∞ –Ω–∞–π-–¥–æ–≤–æ–ª–Ω–∏—è –æ—Ç –≤—Å–∏—á–∫–∏.
50 –ú–∞–ª–∫–∏—è          POS:ADJ        Gen:M    Num:S   DEP:vocative   Sent: –ú–∞–ª–∫–∏—è, –∑–∞–ø–æ–≤—è–¥–∞–π –µ–¥–Ω–æ –±–æ–Ω–±–æ–Ω—á–µ!
58 –ü—Ä–∏—è—Ç–µ–ª—è        POS:NOUN       Gen:M    Num:S   DEP:nsubj      Sent: –ü—Ä–∏—è—Ç–µ–ª—è –Ω–∏ –µ –ª–µ–∫–∞—Ä—è.         
60 –ª–µ–∫–∞—Ä—è          POS:NOUN       Gen:M    Num:S   DEP:nmod       Sent: –ê–Ω–≥–µ–ª –ò–≤–∞–Ω—á–µ–≤, –ª–µ–∫–∞—Ä—è, –∂–∏–≤–µ–µ —Ç—É–∫–∞.
62 –ü—Ä–∏—è—Ç–µ–ª—è        POS:NOUN       Gen:M    Num:S   DEP:root       Sent: –ü—Ä–∏—è—Ç–µ–ª—è –Ω–∏ –µ –ê–Ω–≥–µ–ª

* 44 _–Ω–∞–π-–≤–µ—Ä–Ω–∏—è_ - should be fixed by Rule4
* 46 _–¥–∞—Ä–∏—Ç–µ–ª—è_ - should be fixed by Rule4
* 48 _–Ω–∞–π_–¥–æ–≤–æ–ª–Ω–∏—è_ - should be fixed by Rule4
* 50 _–ú–∞–ª–∫–∏—è_ - should be fixed by Rule_6
* 58 _–ü—Ä–∏—è—Ç–µ–ª—è_ on row 58 - should be fixed by Rule_7
* 60 _–ª–µ–∫–∞—Ä—è_ - should be fixed by Rule_7
* 62 _–ü—Ä–∏—è—Ç–µ–ª—è_ on row 62 - should be fixed by Rule_7

In [None]:
# keep the lists with passing and current tests up to date
idx_failed = [50, 58, 60, 62]
idx_passed = sorted(set(idx_passed + p))
idx_passed = sorted(list(set(idx_passed) - set(idx_failed)))
idx_current = sorted(list(set(idx_current) - set(p)))
idx_current = sorted(list(set(idx_current) - set(idx_failed)))

#### 6.5.3 Regression Test

Ensure that we have not violated Rule 1 and Rule 2:

In [None]:
_ = test_definite_article_all(test_set_1.loc[idx_passed], conditions, rules, print_passed=False)

‚úÖ All tests passed


### 6.6 Iteration 4

#### 6.6.1 Implement Rule4

According to Rule4, the full definite article should be used when a noun is after verbs like _—Å—ä–º_, _–±—ä–¥–∞_, _–æ–∫–∞–∑–≤–∞–º —Å–µ_, _–∏–∑–≥–ª–µ–∂–¥–∞–º_, etc. In order to check the previous verb, we need to check the _lemma_ of the verb.

First let's see what the previously failed sentences look like.

In [None]:
inspect_spacy_doc(nlp_st(test_set_1.loc[39, "text"]))
inspect_spacy_doc(nlp_st(test_set_1.loc[44, "text"]))
inspect_spacy_doc(nlp_st(test_set_1.loc[46, "text"]))
inspect_spacy_doc(nlp_st(test_set_1.loc[48, "text"]))

Token: –ò–≤–∞–Ω            Tag: Npmsi           POS: PROPN      Lemma: –∏–≤–∞–Ω            Dep: nsubj     
Token: –µ               Tag: Vxitf-r3s       POS: AUX        Lemma: —Å—ä–º             Dep: cop       
Token: —Å—ä—É—á–µ–Ω–∏–∫—ä—Ç      Tag: Ncmsf           POS: NOUN       Lemma: —Å—ä—É—á–µ–Ω–∏–∫        Dep: root      
Token: –º–∏              Tag: Psot--1         POS: PRON       Lemma: –∞–∑              Dep: det       
Token: .               Tag: punct           POS: PUNCT      Lemma: .               Dep: punct     


Token: –ü–µ—à–æ            Tag: Npmsi           POS: PROPN      Lemma: –ø–µ—à–æ            Dep: nsubj     
Token: —Å–µ              Tag: Ppxta           POS: PRON       Lemma: —Å–µ              Dep: expl      
Token: –æ–∫–∞–∑–∞           Tag: Vpptf-o3s       POS: VERB       Lemma: –æ–∫–∞–∂–∞-(—Å–µ)      Dep: root      
Token: –Ω–∞–π-–≤–µ—Ä–Ω–∏—è      Tag: Amsh            POS: ADJ        Lemma: –≤–µ—Ä–µ–Ω           Dep: amod      
Token: –º—É              Tag: Psot-

<img src="images/rule4_pattern_new.png" width="800">

The above pattern shows that the first token is a verb (including auxiliary), while the second token is either a noun or an adjective, both of which must be in masculine singular form.

In [None]:
# –ü—ä–ª–µ–Ω –µ —á–ª–µ–Ω—ä—Ç –∏ –Ω–∞ –∏–º–µ—Ç–æ, —É–ø–æ—Ç—Ä–µ–±–µ–Ω–æ —Å–ª–µ–¥ –≥–ª–∞–≥–æ–ª–∏ –∫–∞—Ç–æ —Å—ä–º, –±—ä–¥–∞, –æ–∫–∞–∑–≤–∞–º —Å–µ, –∏–∑–≥–ª–µ–∂–¥–∞–º –∏ –¥—Ä. [3]
def cond_rule4(row_dict, i):
    is_token1_aux = nlp_get_pos(row_dict, i) == "AUX"
    is_token1_verb = nlp_get_pos(row_dict, i) == "VERB"

    is_token2_noun = nlp_get_pos(row_dict, i + 1) == "NOUN"
    is_token2_adj = nlp_get_pos(row_dict, i + 1) == "ADJ"
    is_token2_masc_sg = is_masculine_singular(row_dict, i + 1)
    is_article_indefinite = nlp_get_article(row_dict, i + 1) == "indefinite"

    is_lemma_aux_syn = False

    if is_token2_masc_sg:
        if is_token2_noun or (is_token2_adj and not is_article_indefinite):
            if is_token1_aux or is_token1_verb:
                doc = nlp_st(row_dict["text"])
                lemma = doc[i].lemma_
                # –í–∏–Ω–∞–≥–∏ —Å–ª–µ–¥ –≥–ª–∞–≥–æ–ª–∞ ‚Äû—Å—ä–º‚Äú –∏–ª–∏ —Å–ª–µ–¥ –≥–ª–∞–≥–æ–ª–∏, –∫–æ–∏—Ç–æ —Å–∞ —Å –ø—Ä–µ–Ω–æ—Å–Ω–∞ —É–ø–æ—Ç—Ä–µ–±–∞ –∏ –º–æ–∂–µ –¥–∞ –±—ä–¥–∞—Ç —Å–∏–Ω–æ–Ω–∏–º–∏ –Ω–∞
                # –≥–ª–∞–≥–æ–ª–∞ ‚Äû—Å—ä–º‚Äú (–æ–∫–∞–∑–∞ —Å–µ, –∏–∑–≥–ª–µ–∂–¥–∞, –∫–∞–∑–≤–∞–º —Å–µ), —Å–µ –ø–∏—à–µ –ø—ä–ª–µ–Ω —á–ª–µ–Ω (-—ä—Ç/-—è—Ç)
                is_lemma_aux_syn = lemma in ("–µ", "—Å—ä–º", "–æ–∫–∞–∂–∞-(—Å–µ)", "–∏–∑–≥–ª–µ–∂–¥–∞")
                if is_lemma_aux_syn:
                    return True
    return False

In [None]:
def rule4(token1, token2, idx_token):
    errors = []

    # –µ        —Å—ä—É—á–µ–Ω–∏–∫—ä—Ç
    # –æ–∫–∞–∑–∞    –Ω–∞–π_–≤–µ—Ä–Ω–∏—è
    # –æ–∫–∞–∑–∞    –¥–∞—Ä–∏—Ç–µ–ª—è
    # –∏–∑–≥–ª–µ–∂–¥–∞ –Ω–∞–π-–¥–æ–≤–æ–ª–Ω–∏—è
    if not token2.endswith(("—ä—Ç", "—è—Ç")):  # —Å–ª–µ–¥ –≥–ª–∞–≥–æ–ª –¥—É–º–∞—Ç–∞ —Ç—Ä—è–±–≤–∞ –¥–∞ –µ —Å –ø—ä–ª–µ–Ω –æ–ø—Ä. —á–ª–µ–Ω
        errors.append(token2)

    return errors

#### 6.6.2 Test Rule4

In [None]:
conditions = [cond_rule1, cond_rule2, cond_rule3, cond_rule4]
rules = [rule1, rule2, rule3, rule4]

p, c = test_definite_article_all(test_set_1.loc[[39, 44, 46, 48]], conditions, rules)

39 ‚ùå –ò–≤–∞–Ω –µ —Å—ä—É—á–µ–Ω–∏–∫—ä—Ç –º–∏. (Actual: '—Å—ä—É—á–µ–Ω–∏–∫—ä—Ç', Expected: '')
44 ‚úÖ –ü–µ—à–æ —Å–µ –æ–∫–∞–∑–∞ –Ω–∞–π-–≤–µ—Ä–Ω–∏—è –º—É –ø—Ä–∏—è—Ç–µ–ª. ('–Ω–∞–π-–≤–µ—Ä–Ω–∏—è' is incorrect)
46 ‚úÖ –ü–µ—à–æ —Å–µ –æ–∫–∞–∑–∞ –¥–∞—Ä–∏—Ç–µ–ª—è –Ω–∞ —É—á–∏–ª–∏—â–µ—Ç–æ. ('–¥–∞—Ä–∏—Ç–µ–ª—è' is incorrect)
48 ‚úÖ –£—á–µ–Ω–∏–∫—ä—Ç –Ω–∞ –ø—ä—Ä–≤–∏—è —á–∏–Ω –∏–∑–≥–ª–µ–∂–¥–∞ –Ω–∞–π-–¥–æ–≤–æ–ª–Ω–∏—è –æ—Ç –≤—Å–∏—á–∫–∏. ('–Ω–∞–π-–¥–æ–≤–æ–ª–Ω–∏—è' is incorrect)
‚ùå 1/4 failed.


##### Analyze

We expected row 39 to pass but it is still failing. This is likely due to one of the existing rules. We should run tests on each rule individually to identify which one is causing the issue, specifically the one that returns the word _—Å—ä—É—á–µ–Ω–∏–∫—ä—Ç_.

In [None]:
p, c = test_definite_article_all(test_set_1.loc[[39]], [cond_rule1], [rule1], print_passed=False)

‚úÖ All tests passed


In [None]:
p, c = test_definite_article_all(test_set_1.loc[[39]], [cond_rule2], [rule2], print_passed=False)

39 ‚ùå –ò–≤–∞–Ω –µ —Å—ä—É—á–µ–Ω–∏–∫—ä—Ç –º–∏. (Actual: '—Å—ä—É—á–µ–Ω–∏–∫—ä—Ç', Expected: '')
‚ùå 1/1 failed.


Row 39 doesn't pass due to Rule2. In Rule4 we checked whether the previous token was AUX or a VERB, so in Rule2 we need to introduce an exception that specifies the previous token cannot be a VERB or AUX. Otherwise we will have two rules acting on the same pattern.

#### 6.6.3 Update Rule2 and test again

In [None]:
# keep the old rule if we need to test with it
cond_rule2_prev = cond_rule2
rule2_prev = rule2


def cond_rule2(row_dict, i):
    """Search for either:
        - single, masculine NOUN, not subject, not preceded by a verb
        - preceding token is a verb but different from —Å—ä–º, –æ–∫–∞–∑–≤–∞–º —Å–µ, –∏–∑–ª–≥–µ–∂–¥–∞, etc."""
    is_token1_noun = nlp_get_pos(row_dict, i) == "NOUN"
    is_token1_verb = nlp_get_pos(row_dict, i) == "VERB"
    is_token1_aux = nlp_get_pos(row_dict, i) == "AUX"
    is_token1_masc_sg = is_masculine_singular(row_dict, i)
    is_token1_subj = is_dep_subject(row_dict, i)

    is_token2_noun = nlp_get_pos(row_dict, i + 1) == "NOUN"
    is_token2_subj = is_dep_subject(row_dict, i + 1)
    is_token2_masc_sg = is_masculine_singular(row_dict, i + 1)

    # if token is masculine single noun and !subj, and the previous token is not a verb
    if i == 0 and is_token1_noun and is_token1_masc_sg and not is_token1_subj:
        return True

    is_like_verb = is_token1_verb or is_token1_aux

    if i != 0 and is_token2_noun and is_token2_masc_sg and not is_token2_subj:
        lemma = nlp_get_lemma(row_dict, i)

        is_lemma_aux_syn = lemma in ("–µ", "—Å—ä–º", "–æ–∫–∞–∂–∞-(—Å–µ)", "–∏–∑–≥–ª–µ–∂–¥–∞")
        is_non_aux_syn = not is_like_verb or (is_like_verb and not is_lemma_aux_syn)
        if is_non_aux_syn:
            return True

    return False

In [None]:
def rule2(token1, token2, token_idx):
    errors = []
    word = token1 if token_idx == 0 else token2
    if word.endswith(("—ä—Ç", "—è—Ç")):  # objects should not have full article
        errors.append(word)

    return errors

First test with a small set:

In [None]:
conditions = [cond_rule1, cond_rule2, cond_rule3, cond_rule4]
rules = [rule1, rule2, rule3, rule4]
p, c = test_definite_article_all(test_set_1.loc[[39, 44, 46, 48]], conditions, rules)

39 ‚úÖ –ò–≤–∞–Ω –µ —Å—ä—É—á–µ–Ω–∏–∫—ä—Ç –º–∏. (The sentence is correct)
44 ‚úÖ –ü–µ—à–æ —Å–µ –æ–∫–∞–∑–∞ –Ω–∞–π-–≤–µ—Ä–Ω–∏—è –º—É –ø—Ä–∏—è—Ç–µ–ª. ('–Ω–∞–π-–≤–µ—Ä–Ω–∏—è' is incorrect)
46 ‚úÖ –ü–µ—à–æ —Å–µ –æ–∫–∞–∑–∞ –¥–∞—Ä–∏—Ç–µ–ª—è –Ω–∞ —É—á–∏–ª–∏—â–µ—Ç–æ. ('–¥–∞—Ä–∏—Ç–µ–ª—è' is incorrect)
48 ‚úÖ –£—á–µ–Ω–∏–∫—ä—Ç –Ω–∞ –ø—ä—Ä–≤–∏—è —á–∏–Ω –∏–∑–≥–ª–µ–∂–¥–∞ –Ω–∞–π-–¥–æ–≤–æ–ª–Ω–∏—è –æ—Ç –≤—Å–∏—á–∫–∏. ('–Ω–∞–π-–¥–æ–≤–æ–ª–Ω–∏—è' is incorrect)
‚úÖ All tests passed


Next test with the remaining rows:

In [None]:
p, c = test_definite_article_all(test_set_1.loc[idx_current], conditions, rules)

44 ‚úÖ –ü–µ—à–æ —Å–µ –æ–∫–∞–∑–∞ –Ω–∞–π-–≤–µ—Ä–Ω–∏—è –º—É –ø—Ä–∏—è—Ç–µ–ª. ('–Ω–∞–π-–≤–µ—Ä–Ω–∏—è' is incorrect)
46 ‚úÖ –ü–µ—à–æ —Å–µ –æ–∫–∞–∑–∞ –¥–∞—Ä–∏—Ç–µ–ª—è –Ω–∞ —É—á–∏–ª–∏—â–µ—Ç–æ. ('–¥–∞—Ä–∏—Ç–µ–ª—è' is incorrect)
48 ‚úÖ –£—á–µ–Ω–∏–∫—ä—Ç –Ω–∞ –ø—ä—Ä–≤–∏—è —á–∏–Ω –∏–∑–≥–ª–µ–∂–¥–∞ –Ω–∞–π-–¥–æ–≤–æ–ª–Ω–∏—è –æ—Ç –≤—Å–∏—á–∫–∏. ('–Ω–∞–π-–¥–æ–≤–æ–ª–Ω–∏—è' is incorrect)
64 ‚úÖ –¢–æ–π –µ –Ω–∞–π-–≤–∏—Å–æ–∫–∏—è –∏ —Ö—É–±–∞–≤ –≤ —Å—Ç–∞—è—Ç–∞. ('–Ω–∞–π-–≤–∏—Å–æ–∫–∏—è' is incorrect)
66 ‚ùå –ê–∑ –∂–∏–≤–µ—è –≤ –Ω–æ–≤–∏—è—Ç –±—è–ª –±–ª–æ–∫. (Actual: '', Expected: '–Ω–æ–≤–∏—è—Ç')
68 ‚ùå –í–∏—Å–æ–∫–∏—è –±—è–ª –±–ª–æ–∫ –µ –Ω–æ–≤. (Actual: '', Expected: '–í–∏—Å–æ–∫–∏—è')
72 ‚ùå –£—á–µ–Ω–∏–∫—ä—Ç, –∞ –Ω–µ —É—á–∏—Ç–µ–ª—è—Ç –∏–∑–≤–∏–∫–∞ –ø—Ä–∏ —Å–µ–±–µ —Å–∏ –¥–∏—Ä–µ–∫—Ç–æ—Ä—ä—Ç. (Actual: '—É—á–∏—Ç–µ–ª—è—Ç,–¥–∏—Ä–µ–∫—Ç–æ—Ä—ä—Ç', Expected: '–£—á–µ–Ω–∏–∫—ä—Ç,—É—á–∏—Ç–µ–ª—è—Ç')
73 ‚ùå –£—á–µ–Ω–∏–∫–∞, –∞ –Ω–µ —É—á–∏—Ç–µ–ª—è –∏–∑–≤–∏–∫–∞ –ø—Ä–∏ —Å–µ–±–µ —Å–∏ –¥–∏—Ä–µ–∫—Ç–æ—Ä—ä—Ç. (Actual: '–£—á–µ–Ω–∏–∫–∞,–¥–∏—Ä–µ

##### Analyze the remaining failures

In [None]:
inspect_spacy_doc(nlp_st(test_set_1.loc[66, "text"]))
inspect_spacy_doc(nlp_st(test_set_1.loc[68, "text"]))

Token: –ê–∑              Tag: Ppe-os1         POS: PRON       Lemma: –∞–∑              Dep: nsubj     
Token: –∂–∏–≤–µ—è           Tag: Vpitf-r1s       POS: VERB       Lemma: –∂–∏–≤–µ—è           Dep: root      
Token: –≤               Tag: R               POS: ADP        Lemma: –≤               Dep: case      
Token: –Ω–æ–≤–∏—è—Ç          Tag: Amsf            POS: ADJ        Lemma: –Ω–æ–≤             Dep: amod      
Token: –±—è–ª             Tag: Amsi            POS: ADJ        Lemma: –±—è–ª             Dep: amod      
Token: –±–ª–æ–∫            Tag: Ncmsi           POS: NOUN       Lemma: –±–ª–æ–∫            Dep: iobj      
Token: .               Tag: punct           POS: PUNCT      Lemma: .               Dep: punct     


Token: –í–∏—Å–æ–∫–∏—è         Tag: Amsh            POS: ADJ        Lemma: –≤–∏—Å–æ–∫           Dep: amod      
Token: –±—è–ª             Tag: Amsi            POS: ADJ        Lemma: –±—è–ª             Dep: amod      
Token: –±–ª–æ–∫            Tag: Ncmsi           PO

* 66 _–Ω–æ–≤–∏—è—Ç –±—è–ª –±–ª–æ–∫_ and 68 _–í–∏—Å–æ–∫–∏—è –±—è–ª –±–ª–æ–∫_ - the pattern ADJ/ADJ/NOUN points to Rule5.
* 72 and 73 depend on the order of words. We don't know who is the doer of the action in those sentences. Let's postpone fixing them for now.

In [None]:
idx_failed = [72, 73]
idx_passed = sorted(set(idx_passed + p))
idx_passed = sorted(list(set(idx_passed) - set(idx_failed)))
idx_current = sorted(list(set(idx_current) - set(p)))
idx_current = sorted(list(set(idx_current) - set(idx_failed)))

#### 6.6.4 Regression Test

In [None]:
_ = test_definite_article_all(test_set_1.loc[idx_passed], conditions, rules, print_passed=False)

‚úÖ All tests passed


### 6.7 Iteration 5

#### 6.7.1 Implement Rule5

Rule5 says that when there are two or more adjectives in front of the noun, only the first adjective takes the definite article, which could be short or long. In the example with rows 66 and 68 which were analyzed in the previous section, we see that the two phrases _–Ω–æ–≤–∏—è—Ç –±—è–ª –±–ª–æ–∫_ and _–≤–∏—Å–æ–∫–∏—è –±—è–ª –±–ª–æ–∫_ which are identical from grammatical standpoint, cannot be distinguished just by looking at their part of speech. We need to also analyze their relation to the entire sentence. The difference here is that in the first case Stanza correctly determined the noun _–±–ª–æ–∫_ as an indirect object, therefore requiring the short form of the definite article, and the same word in the second sentence as nominal subject, requiring full definite article.

<img src="images/rule7_pattern_new.png" width="650">

Although we expressed the condition verbally as one rule, we need to create two sets of functions for the cases of short and long definite article.

In [None]:
# The adjectives, as well as the pronouns, the ordinal numerals etc., used as attributes in the sentence
# are usually placed in front of the nouns they qualify. In this case, the definite article, if needed,
# is joined to the _first_ attribute of the noun phrase. [4],[5]
def cond_rule5_helper(row_dict, i):
    """
    POS: ADJ / ADJ / NOUN iobj,  all masc. singular -> first ADJ must have def. art. short form
    - or -
    POS: ADJ / ADJ / NOUN nsubj, all masc. singular -> first ADJ must have def. art. long form
    """
    if i >= row_dict["num_tokens"] - 2:
        return False

    is_token1_masc_sg = is_masculine_singular(row_dict, i)
    is_token1_adj = nlp_get_pos(row_dict, i) == "ADJ"

    is_token2_masc_sg = is_masculine_singular(row_dict, i + 1)
    is_token2_adj = nlp_get_pos(row_dict, i + 1) == "ADJ"

    is_token3_masc_sg = is_masculine_singular(row_dict, i + 2)
    is_token3_noun = nlp_get_pos(row_dict, i + 2) == "NOUN"

    if is_token1_masc_sg and is_token2_masc_sg and is_token3_masc_sg:
        if is_token1_adj and is_token2_adj and is_token3_noun:
            return True

    return False

In [None]:
def cond_rule5_short(row_dict, i):
    if cond_rule5_helper(row_dict, i):
        noun_dep = nlp_get_dep(row_dict, i + 2)
        if noun_dep in ("iobj"):
            return True

    return False

In [None]:
def cond_rule5_long(row_dict, i):
    if cond_rule5_helper(row_dict, i):
        noun_dep = nlp_get_dep(row_dict, i + 2)
        if noun_dep in ("nsubj"):
            return True

    return False

In [None]:
def rule5_short(token1, token2, token_idx):
    errors = []

    # –ê–∑ –∂–∏–≤–µ—è –≤ –Ω–æ–≤–∏—è—Ç –±—è–ª –±–ª–æ–∫.
    if not token1.endswith(("—è")):  # –ø—ä—Ä–≤–æ—Ç–æ –ø—Ä–∏–ª–∞–≥–∞—Ç–µ–ª–Ω–æ –≤ –≥—Ä—É–ø–∞—Ç–∞ —Ç—Ä—è–±–≤–∞ –¥–∞ –µ —Å –Ω–µ–ø—ä–ª–µ–Ω —á–ª–µ–Ω
        errors.append(token1)

    return errors

In [None]:
def rule5_long(token1, token2, token_idx):
    errors = []

    # –í–∏—Å–æ–∫–∏—è –±—è–ª –±–ª–æ–∫ –µ –Ω–æ–≤.
    if not token1.endswith(("—è—Ç")):  # –ø—ä—Ä–≤–æ—Ç–æ –ø—Ä–∏–ª–∞–≥–∞—Ç–µ–ª–Ω–æ –≤ –≥—Ä—É–ø–∞—Ç–∞ —Ç—Ä—è–±–≤–∞ –¥–∞ –µ —Å –ø—ä–ª–µ–Ω —á–ª–µ–Ω
        errors.append(token1)

    return errors

#### 6.7.2 Test Rule5

In [None]:
conditions = [cond_rule1, cond_rule2, cond_rule3, cond_rule4, cond_rule5_short, cond_rule5_long]
rules = [rule1, rule2, rule3, rule4, rule5_short, rule5_long]
p, c = test_definite_article_all(test_set_1.loc[idx_current], conditions, rules)

66 ‚úÖ –ê–∑ –∂–∏–≤–µ—è –≤ –Ω–æ–≤–∏—è—Ç –±—è–ª –±–ª–æ–∫. ('–Ω–æ–≤–∏—è—Ç' is incorrect)
68 ‚úÖ –í–∏—Å–æ–∫–∏—è –±—è–ª –±–ª–æ–∫ –µ –Ω–æ–≤. ('–í–∏—Å–æ–∫–∏—è' is incorrect)
‚úÖ All tests passed


#### 6.7.3 Regression test

In [None]:
_ = test_definite_article_all(test_set_1.loc[idx_passed], conditions, rules, print_passed=False)

‚úÖ All tests passed


#### 6.7.4 Final test with the whole set

In [None]:
_ = test_definite_article_all(test_set_1, conditions, rules)

0 ‚úÖ –ö–ª—é—á–∞ –µ –Ω–∞ –º–∞—Å–∞—Ç–∞. ('–ö–ª—é—á–∞' is incorrect)
1 ‚úÖ –ö–ª—é—á—ä—Ç –µ –Ω–∞ –º–∞—Å–∞—Ç–∞. (The sentence is correct)
2 ‚úÖ –¶–∞—Ä—è –ø–∏–µ –≤–∏–Ω–æ. ('–¶–∞—Ä—è' is incorrect)
3 ‚úÖ –¶–∞—Ä—è—Ç –ø–∏–µ –≤–∏–Ω–æ. (The sentence is correct)
4 ‚úÖ –£—á–µ–Ω–∏–∫–∞ –µ —É–º–µ–Ω –∏ —Ç—Ä—É–¥–æ–ª—é–±–∏–≤. ('–£—á–µ–Ω–∏–∫–∞' is incorrect)
5 ‚úÖ –£—á–µ–Ω–∏–∫—ä—Ç –µ —É–º–µ–Ω –∏ —Ç—Ä—É–¥–æ–ª—é–±–∏–≤. (The sentence is correct)
6 ‚úÖ –ü—Ä–∏—è—Ç–µ–ª—è –º–∏ –µ –≤ —á—É–∂–±–∏–Ω–∞. ('–ü—Ä–∏—è—Ç–µ–ª—è' is incorrect)
7 ‚úÖ –ü—Ä–∏—è—Ç–µ–ª—è—Ç –º–∏ –µ –≤ —á—É–∂–±–∏–Ω–∞. (The sentence is correct)
8 ‚úÖ –ì–æ—Å—Ç–∞ –ø—Ä–∏—Å—Ç–∏–≥–Ω–∞. ('–ì–æ—Å—Ç–∞' is incorrect)
9 ‚úÖ –ì–æ—Å—Ç—ä—Ç –ø—Ä–∏—Å—Ç–∏–≥–Ω–∞. (The sentence is correct)
10 ‚ùå –ì–æ—Ç–≤–∞—á–∞ –ø—Ä–∏–≥–æ—Ç–≤–∏ –æ–±—è–¥–∞. (Actual: '', Expected: '–ì–æ—Ç–≤–∞—á–∞')
11 ‚úÖ –ì–æ—Ç–≤–∞—á—ä—Ç –ø—Ä–∏–≥–æ—Ç–≤–∏ –æ–±—è–¥–∞. (The sentence is correct)
12 ‚úÖ –í–ª–∞–∫–∞ —Å–ø—Ä—è –Ω–∞ –≥–∞—Ä–∞ –°–æ—Ñ–∏—è. ('–í–ª–∞–∫–∞' is incorrect)
13 ‚úÖ –í–ª–∞–∫—ä—Ç —Å–ø—Ä—è

We didn't predict correctly the usage of the definite article in 13 out of the 78 sentences. In some cases it is due to the NLP tag being incorrect, for example the case where the gender of the noun was tagged wrongly by Stanza. In other case it was because we still haven't implemented Rule7.

## 7 Test with real data

We've initially evaluated our rules using brief, uncomplicated sentences. These were intentionally kept short to clearly demonstrate each specific rule. However, real-world language usage typically involves lengthier and more intricate sentences. It's now necessary to apply our rules to authentic texts to assess their effectiveness.

We will work with the dataset [bulgarian-grammar-mistakes](https://huggingface.co/datasets/thebogko/bulgarian-grammar-mistakes) from huggingface. The data was originally collected from articles from Bulgarian Wikipedia as well as rows from OSCAR's Bulgarian datasets.

### 7.1 Load and prepare the dataset

In [None]:
grammar_errors = pd.read_csv("data/grammar_errors_original.csv")

In [None]:
grammar_errors.shape

(7587, 3)

In [None]:
grammar_errors.head(5)

Unnamed: 0,error_type,erroneous,correct
0,article_misuse,–û—Ç –∫–∞–∫–≤–æ –±–µ—à–µ –Ω–∞–ø—Ä–∞–≤–µ–Ω –≤—Ö–æ–¥–∞ –Ω–∞ –¥–≤–æ—Ä–∞ –Ω–∞ —Å–∫–∏–Ω–∏...,–û—Ç –∫–∞–∫–≤–æ –±–µ—à–µ –Ω–∞–ø—Ä–∞–≤–µ–Ω –≤—Ö–æ–¥—ä—Ç –Ω–∞ –¥–≤–æ—Ä–∞ –Ω–∞ —Å–∫–∏–Ω...
1,article_misuse,"–¢–∞–Ω–µ–≤–∞ –µ –ø—Ä–µ–¥—É–ø—Ä–µ–¥–∏–ª–∞, —á–µ –¥–æ–∫—É–º–µ–Ω—Ç–∞ —â–µ —Å–µ –æ—Ç—Ä–∞...","–¢–∞–Ω–µ–≤–∞ –µ –ø—Ä–µ–¥—É–ø—Ä–µ–¥–∏–ª–∞, —á–µ –¥–æ–∫—É–º–µ–Ω—Ç—ä—Ç —â–µ —Å–µ –æ—Ç—Ä..."
2,article_misuse,–ü–∞—Ç–æ–≥–µ–Ω–µ—Ç–∏—á–Ω–∏—è—Ç –º–µ—Ö–∞–Ω–∏–∑—ä–º –Ω–∞ —Ä–∞–∑–≤–∏—Ç–∏–µ—Ç–æ –Ω–∞ —Ö–∏–ø...,–ü–∞—Ç–æ–≥–µ–Ω–µ—Ç–∏—á–Ω–∏—è—Ç –º–µ—Ö–∞–Ω–∏–∑—ä–º –Ω–∞ —Ä–∞–∑–≤–∏—Ç–∏–µ—Ç–æ –Ω–∞ —Ö–∏–ø...
3,article_misuse,–ü—Ä–µ–∑–∏–¥–µ–Ω—Ç–∞ –∏ –ë–°–ü –æ—Å—ä–¥–∏—Ö–∞ –µ–∫—Å—Ç—Ä–µ–º–∏–∑–º–∞ –∏ –µ–∑–∏–∫–∞ –Ω...,–ü—Ä–µ–∑–∏–¥–µ–Ω—Ç—ä—Ç –∏ –ë–°–ü –æ—Å—ä–¥–∏—Ö–∞ –µ–∫—Å—Ç—Ä–µ–º–∏–∑–º–∞ –∏ –µ–∑–∏–∫–∞ ...
4,article_misuse,–ü–æ–¥ –¥–∞—Ä–µ–Ω–∏–µ—Ç–æ —Å—Ç–æ—è—Ç –∏–º–µ—Ç–æ –∏ –ø–æ–¥–ø–∏—Å–∞ –Ω–∞ –ø—Ä–æ—Å–≤–µ—Ç...,–ü–æ–¥ –¥–∞—Ä–µ–Ω–∏–µ—Ç–æ —Å—Ç–æ—è—Ç –∏–º–µ—Ç–æ –∏ –ø–æ–¥–ø–∏—Å—ä—Ç –Ω–∞ –ø—Ä–æ—Å–≤–µ...


#### 7.1.1 Filter only errors related to article misuse:

In [None]:
grammar_errors.error_type.unique()

array(['article_misuse', 'pronoun_misuse', 'incorrect_verb_suffix_me',
       'noun_adjective_disagreement'], dtype=object)

In [None]:
grammar_errors = grammar_errors[grammar_errors["error_type"] == "article_misuse"]

In [None]:
grammar_errors.shape

(2349, 3)

#### 7.1.2 Rename column headings

In [None]:
grammar_errors = grammar_errors.rename(columns={"erroneous": "incorrect"})

#### 7.1.3 Add a column with differing words

In [None]:
grammar_errors[['incorrect', 'correct']] = grammar_errors[['incorrect', 'correct']].map(str.strip)
grammar_errors[["correct_words", "incorrect_words"]] = grammar_errors.apply(get_differing_words, axis=1, result_type="expand")

In [None]:
grammar_errors.head(3)

Unnamed: 0,error_type,incorrect,correct,correct_words,incorrect_words
0,article_misuse,–û—Ç –∫–∞–∫–≤–æ –±–µ—à–µ –Ω–∞–ø—Ä–∞–≤–µ–Ω –≤—Ö–æ–¥–∞ –Ω–∞ –¥–≤–æ—Ä–∞ –Ω–∞ —Å–∫–∏–Ω–∏...,–û—Ç –∫–∞–∫–≤–æ –±–µ—à–µ –Ω–∞–ø—Ä–∞–≤–µ–Ω –≤—Ö–æ–¥—ä—Ç –Ω–∞ –¥–≤–æ—Ä–∞ –Ω–∞ —Å–∫–∏–Ω...,–≤—Ö–æ–¥–∞,–≤—Ö–æ–¥—ä—Ç
1,article_misuse,"–¢–∞–Ω–µ–≤–∞ –µ –ø—Ä–µ–¥—É–ø—Ä–µ–¥–∏–ª–∞, —á–µ –¥–æ–∫—É–º–µ–Ω—Ç–∞ —â–µ —Å–µ –æ—Ç—Ä–∞...","–¢–∞–Ω–µ–≤–∞ –µ –ø—Ä–µ–¥—É–ø—Ä–µ–¥–∏–ª–∞, —á–µ –¥–æ–∫—É–º–µ–Ω—Ç—ä—Ç —â–µ —Å–µ –æ—Ç—Ä...",–¥–æ–∫—É–º–µ–Ω—Ç–∞,–¥–æ–∫—É–º–µ–Ω—Ç—ä—Ç
2,article_misuse,–ü–∞—Ç–æ–≥–µ–Ω–µ—Ç–∏—á–Ω–∏—è—Ç –º–µ—Ö–∞–Ω–∏–∑—ä–º –Ω–∞ —Ä–∞–∑–≤–∏—Ç–∏–µ—Ç–æ –Ω–∞ —Ö–∏–ø...,–ü–∞—Ç–æ–≥–µ–Ω–µ—Ç–∏—á–Ω–∏—è—Ç –º–µ—Ö–∞–Ω–∏–∑—ä–º –Ω–∞ —Ä–∞–∑–≤–∏—Ç–∏–µ—Ç–æ –Ω–∞ —Ö–∏–ø...,–∏–∑–ª–∏—à—ä–∫–∞,–∏–∑–ª–∏—à—ä–∫—ä—Ç


#### 7.1.4 Melt the dataset and sort pairs of correct/incorrect sentences to be together

In [None]:
grammar_errors.shape

(2349, 5)

In [None]:
grammar_errors = melt_and_sort_sentences(grammar_errors)

In [None]:
grammar_errors.shape

(4698, 4)

In [None]:
grammar_errors.head(4)

Unnamed: 0,text,is_correct,incorrect_words,correct_words
0,–û—Ç –∫–∞–∫–≤–æ –±–µ—à–µ –Ω–∞–ø—Ä–∞–≤–µ–Ω –≤—Ö–æ–¥–∞ –Ω–∞ –¥–≤–æ—Ä–∞ –Ω–∞ —Å–∫–∏–Ω–∏...,False,–≤—Ö–æ–¥–∞,–≤—Ö–æ–¥—ä—Ç
1,–û—Ç –∫–∞–∫–≤–æ –±–µ—à–µ –Ω–∞–ø—Ä–∞–≤–µ–Ω –≤—Ö–æ–¥—ä—Ç –Ω–∞ –¥–≤–æ—Ä–∞ –Ω–∞ —Å–∫–∏–Ω...,True,,
2,"–¢–∞–Ω–µ–≤–∞ –µ –ø—Ä–µ–¥—É–ø—Ä–µ–¥–∏–ª–∞, —á–µ –¥–æ–∫—É–º–µ–Ω—Ç–∞ —â–µ —Å–µ –æ—Ç—Ä–∞...",False,–¥–æ–∫—É–º–µ–Ω—Ç–∞,–¥–æ–∫—É–º–µ–Ω—Ç—ä—Ç
3,"–¢–∞–Ω–µ–≤–∞ –µ –ø—Ä–µ–¥—É–ø—Ä–µ–¥–∏–ª–∞, —á–µ –¥–æ–∫—É–º–µ–Ω—Ç—ä—Ç —â–µ —Å–µ –æ—Ç—Ä...",True,,


#### 7.1.5 Add NLP tags

<div style="background-color:bisque">‚ö†Ô∏èNote that calculating the features using Stanza takes around 1 hour on a laptop with average specs, therefore here we are loading a pre-saved file.

In [None]:
if False:
    feature_columns_st = ["pos", "tag", "dep", "morph", "lemmas", "left_edge", "right_edge", "num_tokens", "n_sents"]
    grammar_errors[feature_columns_st] = grammar_errors.apply(lambda r: extract_features(nlp_st, r, "text"), axis=1, result_type="expand")
    grammar_errors.to_csv("data/grammar_errors_with_nlp.csv", index=None)
else:
    # load from file instead of the above, it is faster
    grammar_errors = pd.read_csv("data/grammar_errors_with_nlp.csv")
    grammar_errors["correct_words"] = grammar_errors["correct_words"].fillna("")
    grammar_errors["incorrect_words"] = grammar_errors["incorrect_words"].fillna("")

#### 7.1.6 Filter only rows with 1 sentence, no quotes and other special characters

We need to filter out such texts since our rules can't deal with quoted text inside a sentence. Special characters also cause issues.

In [None]:
grammar_errors = grammar_errors[grammar_errors["n_sents"] == 1]

In [None]:
pattern = r"[\'\"‚Äò‚Äô‚Äú‚Äù¬∞/‚âà\:]"
grammar_errors = grammar_errors[~grammar_errors['text'].str.contains(pattern)]

In [None]:
grammar_errors.shape

(3605, 13)

### 7.2 Test with small subset of the real data

In [None]:
grammar_errors_10tokens = grammar_errors[grammar_errors["num_tokens"] < 10]

In [None]:
# execution will take 30 sec
passed, failed = test_definite_article_all(grammar_errors_10tokens, conditions, rules, print_passed=False, print_failed=True)

10 ‚ùå –°–∞–π—Ç–∞ –£—á–∞. —Å–µ –µ –≤ —Ç—è—Ö–Ω–∞ –ø–æ–º–æ—â. (Actual: '', Expected: '–°–∞–π—Ç–∞')
32 ‚ùå –ë–∏—Ö –∏—Å–∫–∞–ª –∏ –∞–∑ –¥–∞ –¥–∞–º —Å–≤–æ—è –ø—Ä–∏–Ω–æ—Å. (Actual: '', Expected: '—Å–≤–æ—è')
50 ‚ùå –ó–∞–µ–º–∞ –∏–º–∞ –ø—Ä–µ–¥–Ω–∞–∑–Ω–∞—á–µ–Ω–∏–µ –∑–∞ –æ–±–∑–∞–≤–µ–∂–¥–∞–Ω–µ –Ω–∞ –¥–æ–º–∞. (Actual: '', Expected: '–ó–∞–µ–º–∞')
258 ‚ùå –†–µ–π—Å –∫–æ–Ω—Ç—Ä–æ–ª–∞ –ø—É—Å–Ω–∞ –∫–æ–ª–∞ –∑–∞ —Å–∏–≥—É—Ä–Ω–æ—Å—Ç. (Actual: '–†–µ–π—Å', Expected: '–∫–æ–Ω—Ç—Ä–æ–ª–∞')
259 ‚ùå –†–µ–π—Å –∫–æ–Ω—Ç—Ä–æ–ª—ä—Ç –ø—É—Å–Ω–∞ –∫–æ–ª–∞ –∑–∞ —Å–∏–≥—É—Ä–Ω–æ—Å—Ç. (Actual: '–†–µ–π—Å', Expected: '')
332 ‚ùå –°–∞–π—Ç–∞ –µ —Ä–∞–∑—Ä–∞–±–æ—Ç–µ–Ω –æ—Ç –ú–µ–¥–∏—è –≥—Ä—É–ø 24 –û–û–î. (Actual: '', Expected: '–°–∞–π—Ç–∞')
338 ‚ùå –î–∞, —Ç–æ–≤–∞ –µ –£—á–∏—Ç–µ–ª—è—Ç. (Actual: '–£—á–∏—Ç–µ–ª—è—Ç', Expected: '')
339 ‚ùå –î–∞, —Ç–æ–≤–∞ –µ –£—á–∏—Ç–µ–ª—è—Ç. (Actual: '–£—á–∏—Ç–µ–ª—è—Ç', Expected: '')
366 ‚ùå –¢—Ä–∏—É–º–≤–∏—Ä–∞—Ç–∞ –µ –º—ä—Ä—Ç—ä–≤. (Actual: '', Expected: '–¢—Ä–∏—É–º–≤–∏—Ä–∞—Ç–∞')
416 ‚ùå –ü–∏–∫–∞ –Ω–∞ –µ—Å–µ–Ω 20

Let's review the outcomes.

**To begin with**, the rules failed to detect errors in nearly a quarter (24%) of the tests.
However, upon closer inspection of the failures, a pattern emerges. Upon closer look, it is evident that most of the failed tests have even-numbered indices. This suggests that while correctly formed sentences were accurately identified, not all erroneous sentences were successfully detected.

Let's take a closer look at the instances where the tests failed for sentences that were initially correct:

In [None]:
grammar_errors.loc[[338, 339, 2022, 2023, 3396, 3397, 3942, 3943 ]][['text', 'is_correct', 'incorrect_words', 'correct_words']]

Unnamed: 0,text,is_correct,incorrect_words,correct_words
338,"–î–∞, —Ç–æ–≤–∞ –µ –£—á–∏—Ç–µ–ª—è—Ç.",False,,
339,"–î–∞, —Ç–æ–≤–∞ –µ –£—á–∏—Ç–µ–ª—è—Ç.",True,,
2022,–ê–¥–º–∏–Ω–∏—Å—Ç—Ä–∞—Ç–æ—Ä—ä—Ç –Ω–∞ —Ñ–æ—Ä—É–º–∞ —Ç–∞–∫–∞ –µ —Ä–µ—à–∏–ª.,False,–ê–¥–º–∏–Ω–∏—Å—Ç—Ä–∞—Ç–æ—Ä—ä—Ç,–ê–¥–º–∏–Ω–∏—Å—Ç—Ä–∞—Ç–æ—Ä–∞
2023,–ê–¥–º–∏–Ω–∏—Å—Ç—Ä–∞—Ç–æ—Ä–∞ –Ω–∞ —Ñ–æ—Ä—É–º–∞ —Ç–∞–∫–∞ –µ —Ä–µ—à–∏–ª.,True,,
3396,–û—Ñ–∏—Ü–∏–∞–ª–Ω–∏—è—Ç –µ–∑–∏–∫ –Ω–∞ —Ñ–æ—Ä—É–º–∞ –µ –±—ä–ª–≥–∞—Ä—Å–∫–∏—è—Ç.,False,–û—Ñ–∏—Ü–∏–∞–ª–Ω–∏—è—Ç,–û—Ñ–∏—Ü–∏–∞–ª–Ω–∏—è
3397,–û—Ñ–∏—Ü–∏–∞–ª–Ω–∏—è –µ–∑–∏–∫ –Ω–∞ —Ñ–æ—Ä—É–º–∞ –µ –±—ä–ª–≥–∞—Ä—Å–∫–∏—è—Ç.,True,,
3942,–¢–æ–≤–∞ –µ —É—Å–ø–æ–∫–æ–∏—Ç–µ–ª–Ω–∏—è—Ç —Ö–∞–ø –∑–∞ —Å—ä–≤–µ—Å—Ç—Ç–∞ –º—É ...,False,—É—Å–ø–æ–∫–æ–∏—Ç–µ–ª–Ω–∏—è—Ç,—É—Å–ø–æ–∫–æ–∏—Ç–µ–ª–Ω–∏—è
3943,–¢–æ–≤–∞ –µ —É—Å–ø–æ–∫–æ–∏—Ç–µ–ª–Ω–∏—è —Ö–∞–ø –∑–∞ —Å—ä–≤–µ—Å—Ç—Ç–∞ –º—É ...,True,,


The sentence _–î–∞, —Ç–æ–≤–∞ –µ –£—á–∏—Ç–µ–ª—è—Ç_ is listed as both correct and incorrect.

The sentence _–ê–¥–º–∏–Ω–∏—Å—Ç—Ä–∞—Ç–æ—Ä–∞ –Ω–∞ —Ñ–æ—Ä—É–º–∞ —Ç–∞–∫–∞ –µ —Ä–µ—à–∏–ª_ is listed as correct, but in fact it is not.

The sentence _–¢–æ–≤–∞ –µ —É—Å–ø–æ–∫–æ–∏—Ç–µ–ª–Ω–∏—è —Ö–∞–ø –∑–∞ —Å—ä–≤–µ—Å—Ç—Ç–∞ –º—É_ is also listed as correct, but in fact it is incorrect.

Therefore, in at least these 3 examples, the rules correctly identified the error, but the expected result was wrong bcause the original data in the dataset was wrong.

The assumption about the original wrong data is, that the data was scraped from various sources and considered "correct", then errors were automatically introduced to produce the incorrect version. However, it's important to note that the source material itself may not always be grammatically accurate. For exmaple, the sentence _–¢–æ–≤–∞ –µ —É—Å–ø–æ–∫–æ–∏—Ç–µ–ª–Ω–∏—è —Ö–∞–ø –∑–∞ —Å—ä–≤–µ—Å—Ç—Ç–∞ –º—É_ can be found [in a blog comment from 2010](https://petdoshkov.blog.bg/drugi/2010/05/10/vyzzivnoto-reshenie-za-nakazanieto-zabelejka.542411). This example demonstrates that even the original text contained grammatical errors, challenging the assumption that the initial data was entirely correct before deliberate mistakes were added.




**Another reason** of the many failures is that we still haven't implemented all rules listed in the Grammas section. Additionally, we tested on a small set of 39 pairs of simple sentences so we didn't verify our "model" with enough data. Another explanation is that in the NLP features we extracted there are two still unused ones - left edge and right edge.

### 7.3 Test with the whole dataset

In [None]:
grammar_errors.shape

(3605, 13)

<div style="background-color:Bisque">

‚ö†Ô∏èThe below test will take ~ 18 minutes. As an alternative to executing it you may take a look at the screenshot.

In [None]:
if False:
    start_time = time.time()

    passed, failed = test_definite_article_all(grammar_errors, conditions, rules, print_passed=False, print_failed=False)

    end_time = time.time()
    execution_time_st = end_time - start_time
    print(f"Execution time (stanza): {execution_time_st} seconds")

Screenshot of the result:

<img src="images/whole_dataset_result.png">

Testing with the whole dataset we see that one third of the tests failed. Certainly there is room for improvement!

## Summary and insights gained

Although the results of the final test were not as encouraging as hoped, the project still yielded valuable insights.

Initially, selecting the appropriate NLP library was a challenging task. While UDPipe offered good performance, it lacked accuracy, and although Stanza delivered better results, it was much slower. This experience highlighted the importance of carefully balancing accuracy and performance when choosing tools for linguistic analysis.

Another key realization was the unexpected complexity of Bulgarian grammar, particularly the rules surrounding the use of the definite article. This complexity necessitated a deeper investigation into the language's linguistic structures.

Parsing text to detect incorrect definite articles also proved to be more complicated than expected. The development of effective rules for this task was hindered by the diversity of sentence structures encountered.

Additionally, the testing dataset posed its own challenges, as it contained inaccuracies that affected the validation process. This emphasized the crucial need for high-quality, accurate datasets.

Despite these obstacles, the implementation of rule-based methods produced promising results, especially in identifying errors in shorter sentences. This success demonstrated that even in the face of linguistic complexity, well-designed rules are essential for achieving accurate outcomes.

The insights gained from this effort will undoubtedly contribute to the broader field of computational linguistics and inspire more accurate and efficient solutions for grammatical analysis in Bulgarian. As we continue to refine our approach and expand our rule set, we are confident in our ability to develop a robust and reliable tool for checking the correctness of definite articles in Bulgarian texts.

**References**

<div id="ref1">[1] Astoria Academy, <a href="https://astoria-academy.com/the-definite-articles-of-bulgarian/">The Definite articles of Bulgarian,</a> 2023</div>

<div id="ref2">[2] CoLanguage <a href="https://www.colanguage.com/definite-article-bulgarian-nouns">Definite article of the Bulgarian nouns</a></div>

<div id="ref3">[3] –≤. ‚Äû–ê–∑ –ë—É–∫–∏‚Äú –±—Ä. 16 <a href="https://ibl.bas.bg/ezikovi_spravki/otnovo-za-palniya-i-kratkiya-tchlen/">–û—Ç–Ω–æ–≤–æ –∑–∞ –ø—ä–ª–Ω–∏—è –∏ –∫—Ä–∞—Ç–∫–∏—è —á–ª–µ–Ω</a></div>

<div id="ref4">[4] Raquel Jacob <a href="https://help.unbabel.com/hc/en-us/articles/360022878854-Language-Guidelines-Bulgarian">Language Guidelines ‚Äì Bulgarian</a></div>

<div id="ref5">[5] Andonova, Sabeva, Zagorova <a href="https://caritas.bg/cms/wp-content/uploads/2015/04/A1-English.pdf?x10535">Bulgarian for Refugees,</a> 2014</div>

<div id="ref6">[6] John Leafgren <a href="http://www.seelrc.org:8080/grammar/pdf/stand_alone_bulgarian.pdf">A Concise Bulgarian
Grammar</a></div>

<div id="ref7">[7] G. Popova <a href="https://www.english-linguistics.de/archives/clark/SIMOV/CM/popova.pdf">Towards an HPSG Account of the
Bulgarian Definite Article</a></div>

<div id="ref8">[8] K. Bontcheva <a href="https://theswissbay.ch/pdf/Books/Linguistics/Mega%20linguistics%20pack/Indo-European/Balto-Slavic/Bulgarian%20Grammar%2C%20Elementary%20%28Bontcheva%29.pdf">Bulgarian Language - Grammar</a></div>