In [1]:
!pip install python-docx
!python -m spacy download ru_core_news_lg

Collecting python-docx
  Downloading python_docx-1.2.0-py3-none-any.whl.metadata (2.0 kB)
Downloading python_docx-1.2.0-py3-none-any.whl (252 kB)
[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/253.0 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[91m╸[0m[90m━[0m [32m245.8/253.0 kB[0m [31m8.8 MB/s[0m eta [36m0:00:01[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m253.0/253.0 kB[0m [31m6.4 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: python-docx
Successfully installed python-docx-1.2.0
Collecting ru-core-news-lg==3.8.0
  Downloading https://github.com/explosion/spacy-models/releases/download/ru_core_news_lg-3.8.0/ru_core_news_lg-3.8.0-py3-none-any.whl (513.4 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m513.4/513.4 MB[0m [31m3.5 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting pymorphy3>=1.0.0 (from ru-core-news-lg==3.8.0)
  Downloading pymorphy3-2.0.

In [None]:
import re
import unicodedata
from docx import Document
import pandas as pd
from tqdm import tqdm


def is_upper_like(word):
    """Return True when *word* looks like an all-caps headword.

    Only characters whose Unicode general category starts with ``L``
    (letters) are inspected.  A word without any letter is never upper-like.
    Otherwise the word qualifies when every letter is uppercase, or when the
    ``№`` sign occurs anywhere in it.
    """
    letters = [ch for ch in word if unicodedata.category(ch)[0] == 'L']
    if not letters:
        return False
    if '№' in word:
        return True
    for ch in letters:
        if not ch.isupper():
            return False
    return True



def is_class_number(word):
    """Tell whether *word* is a (comma-separated list of) noun-class numeral(s).

    Leading/trailing commas are ignored, the rest is split on commas
    (optionally followed by whitespace), and every resulting piece must be
    one of the recognised class numerals.
    """
    allowed = ("I", "II", "III", "IV", "V", "IУ", "У")
    for piece in re.split(r',\s*', word.strip(',')):
        if piece not in allowed:
            return False
    return True


def is_bold_run(run):
    """Report whether a run is bold by any of the probed routes.

    Checked in order, stopping at the first hit: the run's own ``bold``
    flag, ``run.font.bold``, the ``font.bold`` of the run's style, and
    finally whether the style name contains the substring ``"Bold"``.
    Any exception raised while probing is swallowed and counts as
    "not bold".
    """
    try:
        return bool(
            run.bold
            or (hasattr(run.font, "bold") and run.font.bold)
            or (hasattr(run.style, "font") and getattr(run.style.font, "bold", False))
            or (hasattr(run.style, "name") and "Bold" in str(run.style.name))
        )
    except Exception:
        return False


def extract_lemma(text, para=None):
    """Extract the headword (lemma) from the text of a dictionary paragraph.

    Steps:
      1. Collapse whitespace and drop every token accepted by
         ``is_class_number``.
      2. Collect the leading run of tokens that look capitalised
         (``is_upper_like`` on the punctuation-stripped token, or the
         capital-letters regex on the raw token); stop at the first token
         that does not.
      3. If that run does not end with ``:``, it is returned as the lemma.
      4. If it ends with ``:`` and a truthy ``para`` is given, the lemma is
         extended with the tokens that follow the first colon for as long as
         they occur among the words of the paragraph's bold runs.

    Args:
        text: Paragraph text.
        para: Optional paragraph object exposing ``.runs`` (each run having
            ``.text``); only consulted when the capitalised part ends with
            a colon.

    Returns:
        The lemma string, or ``None`` for blank input or when the paragraph
        does not start with a capitalised token.
    """
    if not text or not text.strip():
        return None

    collapsed = re.sub(r'\s+', ' ', text.strip())
    tokens = [piece for piece in collapsed.split() if not is_class_number(piece)]
    text = " ".join(tokens)

    # Leading run of capitalised tokens.
    head = []
    for token in tokens:
        bare = token.strip("()[]{}.,;!?")
        if not (is_upper_like(bare) or re.match(r'^[()]*[A-ZА-ЯЁIУV]+[()]*$', token)):
            break
        head.append(token)

    if not head:
        return None

    cap_part = " ".join(head).strip()
    if not cap_part.endswith(":"):
        return cap_part

    plain_head = cap_part.rstrip(":").strip()
    if not para:
        return plain_head

    # Words that occur in any bold run of the paragraph.
    bold_words = " ".join(run.text for run in para.runs if is_bold_run(run)).split()

    # Tokens right after the first colon that are still set in bold belong
    # to the headword.
    tail = []
    for token in text.partition(":")[2].split():
        if token not in bold_words:
            break
        tail.append(token)

    if tail:
        return f"{cap_part.rstrip(':')}: {' '.join(tail)}"
    return plain_head


def extract_definition(text, lemma):
    """Strip the lemma from the start of *text* and return what is left.

    Both arguments are whitespace-normalised; trailing colons are removed
    from the lemma.  The lemma is matched case-insensitively at the very
    beginning of the text, together with any colons, whitespace or hyphens
    that follow it.  If the text does not start with the lemma, the
    normalised text is returned unchanged.

    Returns:
        The remaining definition, or ``None`` if either argument is empty or
        nothing remains after removing the lemma.
    """
    if not text or not lemma:
        return None

    body = re.sub(r'\s+', ' ', text.strip())
    headword = re.sub(r'\s+', ' ', lemma.strip()).rstrip(":").strip()

    prefix = re.match(rf'^{re.escape(headword)}[:\s-]*', body, flags=re.IGNORECASE)
    remainder = (body[prefix.end():] if prefix else body).strip()
    return remainder or None


def split_definitions(definition_text):
    """Split a dictionary article into its numbered senses.

    Senses are introduced by ``1.``, ``2.``, ... or ``1)``, ``2)``, ...
    Everything before the first ``1.``/``1)`` marker is discarded.  When no
    such marker exists, the whole (whitespace-normalised) text is returned
    as a single sense.

    Returns:
        A list of sense strings; ``[]`` for empty or non-string input.
    """
    if not definition_text or not isinstance(definition_text, str):
        return []

    flat = re.sub(r'\s+', ' ', definition_text.strip())
    first_marker = re.search(r'\b1[.)]', flat)
    if first_marker is None:
        return [flat]

    numbered = flat[first_marker.start():]
    senses = []
    # A marker counts only at the start of the text or right after whitespace.
    for chunk in re.split(r'(?:(?<=\s)|^)\d+[.)]\s*', numbered):
        chunk = chunk.strip()
        if chunk:
            senses.append(chunk)
    return senses


def extract_morphology(entry: str) -> str:
    """Extract the morphological header of a dictionary article.

    Only the part of *entry* before the first ``1.`` / ``1)`` sense marker
    is inspected.  Three kinds of information are collected and returned in
    the order in which they occur in the text, joined by single spaces:

    * the grammatical class: one of I, II, III, IV, V, IУ, У, or a
      comma-separated sequence of them;
    * morphological forms: the first parenthesised group, but only if it
      contains a hyphen; it is prefixed with ``"мн. "`` when ``мн.``
      immediately precedes the bracket;
    * a standalone plural marker ``мн.`` when no such bracket exists.

    The standalone marker used to be detected but never added to the result
    (and its pattern required a word character right after the dot, so
    ``"мн. "`` never matched); it is now reported as documented.

    Args:
        entry: Article text (the full definition or a single sense).

    Returns:
        The extracted morphology string; ``""`` when nothing is found or
        when *entry* is not a string.
    """
    if not isinstance(entry, str):
        return ""

    # Anything from the first numbered sense onwards is definition text.
    entry = re.split(r'\b1[.)]', entry, maxsplit=1)[0].strip()

    class_token = r'(?:IУ|IV|V|III|II|I|У)'
    roman_seq_match = re.search(
        rf'\b{class_token}(?:\s*,\s*{class_token})*\b', entry
    )

    # Only the FIRST parenthesised group is considered, and only when it
    # contains a hyphen (e.g. "(-ла, -би)").
    bracket_match = re.search(r'\([^)]*\)', entry)
    if bracket_match and '-' not in bracket_match.group(0):
        bracket_match = None

    found_parts = []
    if bracket_match:
        forms = bracket_match.group(0)
        start = bracket_match.start()
        # Look at up to five characters before the bracket for a "мн." label.
        if re.search(r'мн\.\s*$', entry[max(0, start - 5):start]):
            forms = "мн. " + forms
        found_parts.append((start, forms))
    else:
        plural_match = re.search(r'\bмн\.', entry)
        if plural_match:
            found_parts.append((plural_match.start(), "мн."))

    if roman_seq_match:
        found_parts.append((roman_seq_match.start(), roman_seq_match.group(0)))

    # Keep the parts in their textual order.
    found_parts.sort(key=lambda part: part[0])
    return " ".join(part_text for _, part_text in found_parts).strip()


# ---------------------------------------------------------------------------
# Stage 1: parse the dictionary .docx into one row per numbered meaning and
# write the result (plus the paragraphs without a lemma) to Excel.
# ---------------------------------------------------------------------------
file_path = "Гунзибско-русский словарь_new.docx"
doc = Document(file_path)

rows = []       # one dict per paragraph that yielded a lemma AND a definition
no_lemma = []   # texts of paragraphs from which no lemma could be extracted

for para in tqdm(doc.paragraphs, desc="Обработка абзацев"):
    text = para.text.strip()
    if not text:
        continue

    lemma = extract_lemma(text, para)
    if not lemma:
        no_lemma.append(text)
        continue

    definition = extract_definition(text, lemma)
    if not definition:
        # NOTE(review): a paragraph that has a lemma but no definition is
        # dropped here without being recorded anywhere — confirm intended.
        continue

    rows.append({
        "lemma": lemma,
        "definition": definition,
        "definition_divided_list": split_definitions(definition)
    })

df = pd.DataFrame(rows)
has_definition = df["definition"].notna() & (df["definition"].str.strip() != "")
df = df[has_definition].reset_index(drop=True)

# Explode every entry into one row per meaning.  id_word numbers the
# entries, id_meaning numbers the meanings within an entry, and global_id
# numbers all produced rows.
final_rows = []
id_word = 0
global_id = 0

for id_word, entry in enumerate(df.to_dict("records"), start=1):
    for id_meaning, def_part in enumerate(entry["definition_divided_list"], start=1):
        global_id += 1
        final_rows.append({
            "id_word": id_word,
            "id_meaning": id_meaning,
            "id": global_id,
            "lemma": entry["lemma"],
            "definition": entry["definition"],
            "definition_divided": def_part
        })

df_final = pd.DataFrame(final_rows)

# Derive the morphology column and remove the morphology text from each
# individual meaning.
morphologies = []
clean_divided = []

for record in df_final.to_dict("records"):
    div_text = record["definition_divided"]
    morph_def = extract_morphology(record["definition"])
    morph_div = extract_morphology(div_text)

    if not (morph_def or morph_div):
        morphology, marker_to_strip = None, None
    elif not morph_div or morph_def == morph_div:
        # Only the article header carries morphology (or both agree).
        morphology, marker_to_strip = morph_def, morph_def
    else:
        # The meaning has morphology of its own: keep both, strip its own.
        morphology = f"{morph_def or ''}; {morph_div or ''}".strip("; ")
        marker_to_strip = morph_div

    new_div_text = div_text
    if marker_to_strip:
        new_div_text = re.sub(
            re.escape(marker_to_strip), "", div_text, flags=re.IGNORECASE
        ).strip()

    morphologies.append(morphology)
    clean_divided.append(
        re.sub(r'\s+', ' ', new_div_text.strip()) if new_div_text else None
    )

df_final["morphology"] = morphologies
df_final["definition_divided"] = clean_divided

output_path = "result_stage_1.xlsx"
df_final.to_excel(output_path, index=False)

if not no_lemma:
    print("Все абзацы успешно обработаны!")
else:
    df_no = pd.DataFrame({"no_lemma_paragraph": no_lemma})
    no_path = "no_lemma_paragraphs.xlsx"
    df_no.to_excel(no_path, index=False)
    print(f"Не удалось извлечь лемму из {len(no_lemma)} абзацев. Сохранено в {no_path}")

print(f"Готово! Найдено лемм: {df_final['id_word'].nunique()}, строк всего: {len(df_final)}")
print(f"Файл сохранён: {output_path}")

Обработка абзацев: 100%|██████████| 9336/9336 [00:24<00:00, 375.40it/s]


Не удалось извлечь лемму из 23 абзацев. Сохранено в drive/MyDrive/Практика - словари/no_lemma_paragraphs.xlsx
Готово! Найдено лемм: 9122, строк всего: 11952
Файл сохранён: drive/MyDrive/Практика - словари/result_stage_1.xlsx


In [None]:
import pandas as pd
import re
import spacy

# Load the large Russian spaCy pipeline once for this cell; the "ner"
# component is disabled in this call.
nlp = spacy.load("ru_core_news_lg", disable=["ner"])

# Stylistic / domain labels that may precede a translation and must not take
# part in part-of-speech detection.
IGNORED_LABELS = [
    "воен.", "перен.", "спец.", "разг.", "уст.", "ирон.", "поэт.", "анат.", "зоол.",
    "бот.", "рел.", "астр.", "ген.", "физиол.", "ист.", "брaн.", "миф.", "фольк.",
    "с.-х.", "геогр.", "спорт.", "кулин.", "дет.", "игр.", "неодобр.", "пренебр.", "шутл.",
    # NOTE(review): the "брaн." entry above appears to be typed with a Latin
    # "a", in which case it never matches all-Cyrillic text.  The all-Cyrillic
    # spelling is added explicitly (escaped to make the code points obvious);
    # the original entry is kept for backward compatibility.
    "\u0431\u0440\u0430\u043d.",
]

# One or more labels (each optionally followed by whitespace) at the very
# start of the phrase.  Compiled once instead of on every call.
_LABEL_PREFIX_RE = re.compile(
    r"^(?:(?:" + "|".join(re.escape(label) for label in IGNORED_LABELS) + r")\s*)+",
    flags=re.IGNORECASE,
)


def clean_phrase(phrase: str) -> str:
    """Strip leading stylistic/domain labels (воен., перен., уст., ...) from a phrase.

    The phrase is trimmed first.  Every label from ``IGNORED_LABELS`` found
    at the beginning is then removed, case-insensitively; stacked labels
    such as ``"воен. разг. ..."`` are removed together (previously only the
    first one was).  Labels further inside the phrase are left alone.

    Args:
        phrase: A single translation phrase.

    Returns:
        The phrase without its leading labels.
    """
    return _LABEL_PREFIX_RE.sub("", phrase.strip())


# Grammatical abbreviations used by the source dictionary, mapped to the POS
# tag written to the output.  Insertion order is the lookup priority.
_POS_MARKERS = {
    "сущ.": "noun",
    "им.": "noun",
    "прил.": "adj",
    "адъектив.": "adj",
    "качеств.": "adj",
    "гл.": "verb",
    "масд.": "masdar",
    "инф.": "verb",
    "прич.": "verb",
    "деепр.": "verb",
    "учащ.": "verb",
    "понуд.": "verb",
    "учащ. понуд.": "verb",
    "союз": "conj",
    "послелог": "post",
    "предлог": "adp",
    "част.": "part",
    "межд.": "intj",
    "звукоподр.": "onomatope",
    "звукосимв.": "onomatope",
    "числ.": "num",
    "мест.": "pron",
    "нареч.": "adv",
}


def _marker_pattern(marker):
    """Compile a regex that finds *marker* as a token of its own, not inside a longer word."""
    # A marker must not be glued to a preceding word character ("гл." must
    # not fire inside "англ.").  Markers ending in a letter ("союз") must not
    # be followed by one either ("союзник").
    tail = r"(?!\w)" if marker[-1].isalnum() else ""
    return re.compile(r"(?<!\w)" + re.escape(marker) + tail)


_POS_MARKER_PATTERNS = [(_marker_pattern(m), pos) for m, pos in _POS_MARKERS.items()]

# spaCy coarse POS tag -> output tag.
_SPACY_POS_MAP = {
    "NOUN": "noun",
    "PROPN": "noun",
    "ADJ": "adj",
    "VERB": "verb",
    "AUX": "verb",
    "ADV": "adv",
    "ADP": "adp",
    "CCONJ": "conj",
    "SCONJ": "conj",
    "PRON": "pron",
    "NUM": "num",
    "PART": "part",
    "INTJ": "intj",
    "SYM": "onomatope",
    "X": "NA",
}


def _pos_from_spacy(first_part):
    """Vote on the POS of the comma-separated phrases of *first_part* using spaCy."""
    pos_counts = {}
    for phrase in (p.strip() for p in first_part.split(",")):
        if not phrase:
            continue
        doc = nlp(clean_phrase(phrase))

        if len(doc) == 0:
            pos_counts["NA"] = pos_counts.get("NA", 0) + 1
            continue

        root = next((t for t in doc if t.dep_ == "ROOT"), doc[0])
        main_pos = root.pos_

        # A phrase that starts with a preposition and is headed by a noun
        # is treated as an adverbial.
        if doc[0].pos_ == "ADP" and main_pos == "NOUN":
            main_pos = "ADV"

        mapped = _SPACY_POS_MAP.get(main_pos, "NA")
        pos_counts[mapped] = pos_counts.get(mapped, 0) + 1

    # Most frequent tag wins; ties go to the tag seen first.
    return max(pos_counts, key=pos_counts.get) if pos_counts else "NA"


def detect_pos(text: str):
    """Determine the part of speech of one dictionary meaning.

    Only the part of *text* before the first ``;`` is analysed.  If it
    contains one of the dictionary's own grammatical abbreviations
    (``сущ.``, ``гл.``, ``нареч.``, ...), the corresponding tag is returned
    with ``detected_by == "source"``.  Abbreviations are matched as separate
    tokens; previously they were matched as raw substrings, so e.g. ``гл.``
    fired inside ``англ.`` and ``им.`` inside ``прим.``.

    Otherwise every comma-separated phrase is parsed with spaCy (after
    removing leading style labels) and the most frequent tag is returned
    with ``detected_by == "spacy_model"``.

    If the full text contains ``♦``, ``погов.`` or ``посл.``, the suffix
    ``"; expression"`` is appended to the tag.

    Args:
        text: The meaning text; non-strings and blank strings are accepted.

    Returns:
        A tuple ``(pos, detected_by)``; ``("NA", "none")`` for non-string or
        blank input.
    """
    if not isinstance(text, str) or not text.strip():
        return "NA", "none"

    has_expression = any(tag in text for tag in ("♦", "погов.", "посл."))
    first_part = text.split(";")[0].strip()

    result = None
    for pattern, pos in _POS_MARKER_PATTERNS:
        if pattern.search(first_part):
            result, detected_by = pos, "source"
            break

    if result is None:
        # No explicit marker in the source: fall back to the spaCy model.
        detected_by = "spacy_model"
        result = _pos_from_spacy(first_part)

    if has_expression:
        result += "; expression"

    return result, detected_by


def process_excel(input_path: str, output_path: str):
    """Add ``pos`` and ``detected_by`` columns to an Excel sheet.

    Reads *input_path*, runs ``detect_pos`` on every value of the
    ``definition_divided`` column, stores the two elements of each result in
    the new columns and writes the frame to *output_path*.

    Raises:
        ValueError: if the ``definition_divided`` column is missing.
    """
    df = pd.read_excel(input_path)

    for col in ("definition_divided",):
        if col not in df.columns:
            raise ValueError(f"В файле отсутствует обязательный столбец: {col}")

    tagged = [detect_pos(value) for value in df["definition_divided"]]
    df["pos"] = [pos for pos, _ in tagged]
    df["detected_by"] = [source for _, source in tagged]

    df.to_excel(output_path, index=False)
    print(f"Файл успешно сохранён: {output_path}")


# Stage 2: tag the output of stage 1.
if __name__ == "__main__":
    process_excel("result_stage_1.xlsx", "result_stage_2.xlsx")

Файл успешно сохранён: drive/MyDrive/Практика - словари/result_stage_2.xlsx


In [None]:
import pandas as pd

def replace_number_sign(text: str) -> str:
    """Return *text* with every ``№`` sign replaced by the superscript ``ᵸ``."""
    return "ᵸ".join(text.split("№"))

# Post-process the stage-2 sheet: replace "№" in every string cell,
# lower-case the lemmas and add the constant metadata columns.
# NOTE: the input file is overwritten with the result.
our_data = (
    pd.read_excel("result_stage_2.xlsx")
    .map(lambda cell: replace_number_sign(cell) if isinstance(cell, str) else cell)
    .assign(
        lemma=lambda frame: frame['lemma'].str.lower(),
        language='Hunzib',
        glottocode='hunz1247',
        annotator='Alesya Voinskaya',
        reference='Khalilov 2026',
    )
)
our_data.to_excel("result_stage_2.xlsx", index=False)

In [None]:
import re
import pandas as pd
from typing import Dict
import unicodedata

# Orthography -> IPA tables used by transliterate_and_check().
#
# "ᵸ" after a vowel marks nasalisation; a macron marks length.
# NOTE(review): several keys below look identical but are listed more than
# once — presumably different Unicode encodings (precomposed vs. combining
# marks, look-alike letters) of the same grapheme; confirm before pruning.
vowels = {

    "а̄ᵸ": "ãː",
    "а̄": "aː",
    "аᵸ": "ã",
    "а": "a",

    "ā": "aː",

    # NOTE(review): every other macron+ᵸ entry carries the length mark "ː";
    # this one maps to the same value as the short "ȧᵸ" — confirm.
    "ǡᵸ": "ɑ̃",
    "ǡ": "ɑː",
    "ȧᵸ": "ɑ̃",
    "ȧ": "ɑ",

    "ǡ": "ɑː",

    "ēᵸ": "ẽː",
    "ē": "eː",
    "еᵸ": "ẽ",
    "е": "e",
    "ё": "e",

    "ӣᵸ": "ĩː",
    "ӣ": "iː",
    "иᵸ": "ĩ",
    "и": "i",

    "ӣ": "iː",

    "ōᵸ": "õː",
    "ō": "oː",
    "оᵸ": "õ",
    "о": "o",

    "ӯᵸ": "ũː",
    "ӯ": "uː",
    "уᵸ": "ũ",
    "у": "u",

    "ӯ": "uː",

    "э̄ᵸ": "ʔẽː",
    "э̄": "ʔeː",
    "эᵸ": "ʔẽ",
    "э": "ʔe",

    "ы̄ᵸ": "ɨ̃ː",
    "ы̄": "ɨː",
    "ыᵸ": "ɨ̃",
    "ы": "ɨ",
    "ьi": "ɨ",

    "ә̄ᵸ": "ə̃ː",
    "ә̄": "əː",
    "әᵸ": "ə̃",
    "ǝ": "ə",

    "ә̄ᵸ": "ə̃ː",
    "ә̄": "əː",
    "әᵸ": "ə̃",
    "ә": "ə",

    "ǝ̄ᵸ": "ə̃ː",
    "ǝ̄": "əː",
    "ǝᵸ": "ə̃",
    "ǝ": "ə",

    "ə̄": "əː"
}

consonants = {

    "б": "b",
    "в": "w",
    "г": "g",
    "гъ": "ʁ",
    "гь": "h",
    "гI": "ʕ",
    "гi": "ʕ",
    "гі": "ʕ",
    "д": "d",
    "ж": "ž",         # ž [ʒ]
    "з": "z",

    "й": "j",
    "к": "k",
    "къ": "qʼ",
    "кь": "ƛ’",       # tɬ’
    "кI": "kʼ",
    "кi": "kʼ",

    "л": "l",
    "лъ": "ɬ",
    "лъ": "ɬ",
    "лI": "ƛ",        # tɬ
    "лi": "ƛ",

    "м": "m",
    "н": "n",

    "п": "p",
    "пI": "pʼ",
    "пi": "pʼ",

    "р": "r",
    "с": "s",

    "т": "t",
    "тI": "tʼ",
    "тi": "tʼ",

    "х": "χ",
    "хъ": "q",        # the source table mixes q and qχ — recorded as q
    "хь": "x",
    "хI": "ħ",
    "хi": "ħ",
    "хі":"ħ",

    "ц": "c",         # c [ts]
    "цI": "cʼ",       # c’ [tsʼ]
    "цi": "cʼ",

    "ч": "č",
    "чI": "čʼ",
    "чi": "čʼ",

    "ш": "š",

    "ъ": "ʔ",

    # punctuation

    "-": "-",  # these symbols are mapped to themselves because we want to keep them
    "(": "(",
    ")": ")",
    "§§": "§§"
}

import re
import unicodedata
from typing import Dict

# Stress marks and similar diacritics that are dropped before transliteration.
_ACCENTS_TO_REMOVE = frozenset({
    '\u0301',
    '\u0300',
    '\u030B',
    '\u0341',
    '\u02CA',
    '\u02CB',
})


def _nfc_keyed(mapping: Dict[str, str]) -> Dict[str, str]:
    """Return *mapping* with NFC-normalised keys.

    The text is NFC-normalised before lookup, so a key in any other form
    could never match.  Keys that already are in NFC form keep their value;
    a normalised key is only added when it does not clash with one of them.
    """
    out = {k: v for k, v in mapping.items() if unicodedata.normalize("NFC", k) == k}
    for k, v in mapping.items():
        out.setdefault(unicodedata.normalize("NFC", k), v)
    return out


def transliterate_and_check(s: str,
                            vowels_map: Dict[str, str] = vowels,
                            cons_map: Dict[str, str] = consonants) -> str:
    """Transliterate a dictionary lemma into a dash-segmented IPA string.

    Processing steps:
      1. Remove ``!``, ``;``, ``...``, the fragments ``" iⅴ"`` / ``" i."``
         and single slashes; ``//`` is protected as ``§§``.  Everything
         from the first ``:`` or ``" ("`` onwards is cut off.
      2. Decompose (NFD), drop digits and stress diacritics.
      3. Keep a nasal sign ``ᵸ`` only when it follows a vowel.  The vowel
         may carry combining marks (macron, dot above): the whole preceding
         base+marks cluster is checked, not just the previous code point.
         (Checking only the previous code point silently dropped ``ᵸ``
         after every long or dotted vowel.)
      4. Recompose (NFC) and replace each grapheme (longest key first) by
         its IPA value followed by ``-``.
      5. If anything is left over after deleting all IPA values from the
         result, print the original lemma so it can be inspected.
      6. Tidy separators: drop the trailing dash, re-attach ``ʼ``, remove
         dashes next to brackets, turn ``---`` (a source hyphen) and a dash
         before a space into a plain space, and restore ``//``.

    Args:
        s: The lemma in dictionary orthography.
        vowels_map: Vowel grapheme -> IPA table.
        cons_map: Consonant/punctuation grapheme -> IPA table.

    Returns:
        The IPA string with ``-`` between segments.
    """
    original_s = s

    s = s.replace("!", "").replace(";", "").replace(" iⅴ", "").replace("...", "").replace(" i.", "")
    s = s.replace("//", "§§")
    s = s.replace("/", "")
    if ":" in s:
        s = s.split(":")[0]
    if " (" in s:
        s = s.split(" (")[0]

    s = unicodedata.normalize("NFD", s)
    s = re.sub(r"\d", "", s)
    s = ''.join(c for c in s if c not in _ACCENTS_TO_REMOVE)

    # Re-attach the superscript ᵸ to the preceding vowel; a ᵸ that does not
    # follow a vowel is dropped.
    vowel_forms = {unicodedata.normalize("NFC", k) for k in vowels_map}
    kept = []
    for ch in s:
        if ch != "ᵸ":
            kept.append(ch)
            continue
        # Walk back over combining marks to the base character.
        start = len(kept) - 1
        while start > 0 and unicodedata.combining(kept[start]):
            start -= 1
        cluster = "".join(kept[start:]) if kept else ""
        if cluster and (
            unicodedata.normalize("NFC", cluster) in vowel_forms
            or unicodedata.normalize("NFC", cluster + ch) in vowel_forms
        ):
            kept.append(ch)
    s = unicodedata.normalize("NFC", "".join(kept))

    mapping = _nfc_keyed({**vowels_map, **cons_map})

    # Longest keys first so that digraphs win over their first letter.
    keys_sorted = sorted(mapping, key=len, reverse=True)
    pattern = re.compile("|".join(re.escape(k) for k in keys_sorted))
    replaced = pattern.sub(lambda m: mapping[m.group(0)] + "-", s)

    # Sanity check: after deleting every IPA value nothing but whitespace
    # should remain; otherwise report the lemma.
    leftover = replaced
    for val in sorted(set(mapping.values()), key=len, reverse=True):
        if val == "":
            continue
        leftover = leftover.replace(val, "")
    if leftover.strip() != "":
        print(original_s)

    replaced_with_dashes = replaced.strip()
    if replaced_with_dashes.endswith("-"):
        replaced_with_dashes = replaced_with_dashes[:-1]
    replaced_with_dashes = replaced_with_dashes.replace("-ʼ", "ʼ")
    replaced_with_dashes = replaced_with_dashes.replace("(-", "(")
    replaced_with_dashes = replaced_with_dashes.replace("-)", ")")
    replaced_with_dashes = replaced_with_dashes.replace("---", " ")
    replaced_with_dashes = replaced_with_dashes.replace("-§§-", "//")
    # A segment separator left dangling in front of a word space.
    replaced_with_dashes = replaced_with_dashes.replace("- ", " ")

    return replaced_with_dashes


# Stage 3: add an "ipa" column with the transliteration of every lemma.
df = pd.read_excel("result_stage_2.xlsx")
df["ipa"] = [transliterate_and_check(str(lemma)) for lemma in df["lemma"]]
df.to_excel("result_stage_3.xlsx", index=False)

бəдǡ̄кiекi/а
гьазарс̄ə
гьǡ̄̄сбурти
kьaлaki
kьȧрхiаh
кючi
кючi
мийавдāрл'ьи
нишерекǡ̄
т̄ǡ̄цi/а
т̄ǡ̄цiекi/а
т̄ǡ̄цiекiерлъи
тіагьарат
тіарикъат
хіаким-сукіу
цǡ̄-цȧсс/ə


In [None]:
# Final export: keep the columns below, in this order.
# NOTE(review): the "ipa" column added in stage 3 and the "annotator" column
# added after stage 2 are not in this list, so they are not exported —
# confirm that this is intended.
new_order = [
    'id_word', 'id_meaning', 'id',
    'lemma', 'morphology',
    'definition', 'definition_divided',
    'glottocode', 'reference', 'language',
    'pos', 'detected_by',
]

df = pd.read_excel("result_stage_3.xlsx").loc[:, new_order]

df.to_excel("Hunzib_dictionary.xlsx", index=False)