# Загрузка библиотек. Подготовка данных

In [7]:
import os
import csv
import stanza
from collections import Counter
import glob
import pandas as pd

In [2]:
# Input directory that holds the lemma CSV files (relative to the working directory).
lemmas_folder = os.path.join(os.pardir, "data", "lemmas")

In [9]:
# Destination directory for the POS statistics; create it if it is missing.
output_folder = os.path.join(os.pardir, "output", "POS")
os.makedirs(output_folder, exist_ok=True)

# Collect only the lemma files whose names start with "cleaned".
cleaned_pattern = os.path.join(lemmas_folder, "cleaned*.csv")
input_files = glob.glob(cleaned_pattern)

# Определяем и считаем части речи

In [12]:
# Load the Latin NLP model (tokenizer + part-of-speech tagger).
stanza.download('la')
nlp = stanza.Pipeline(lang='la', processors='tokenize,pos')

# NOTE(review): the lemmas of each file are joined into one running text, so
# the tokenizer is free to re-segment them and the tagger sees them without
# their original sentence context — confirm this is the intended setup.

# Process every matching input file: read its "lemma" column, tag it, count
# the universal POS tags and write the counts to a "<name>_pos_stats.csv" file.
for csv_path in input_files:
    filename = os.path.basename(csv_path)
    print(f"Обработка файла: {filename}")

    # Unreadable or malformed files are reported and skipped. I/O failures are
    # OSError; decoding errors and pandas parser errors derive from ValueError.
    try:
        df = pd.read_csv(csv_path, encoding="utf-8")
    except (OSError, ValueError) as e:
        print(f"Ошибка при чтении {filename}: {e}")
        continue

    if "lemma" not in df.columns:
        print(f"Пропущен: нет столбца 'lemma' в {filename}")
        continue

    text_data = df["lemma"].dropna().astype(str).tolist()
    text = " ".join(text_data)

    # A column with no usable lemmas produces blank text; do not hand that to
    # the pipeline — an empty statistics table is written instead.
    sentences = nlp(text).sentences if text.strip() else []

    pos_counter = Counter()
    for sentence in sentences:
        for word in sentence.words:
            pos_counter[word.upos] += 1
            print(f"  → {word.text}: {word.upos}")

    result_df = pd.DataFrame(
        list(pos_counter.items()),
        columns=["Часть речи", "Количество"],
    )

    # Build the output path from the file stem: only the real extension is
    # replaced, even if ".csv" also occurs elsewhere in the name or the
    # extension is spelled in a different case.
    stem, _ext = os.path.splitext(filename)
    output_path = os.path.join(output_folder, f"{stem}_pos_stats.csv")

    result_df.to_csv(output_path, index=False, encoding="utf-8")
    print(f"Сохранено в: {output_path}\n")


Downloading https://raw.githubusercontent.com/stanfordnlp/stanza-resources/main/resources_1.10.0.json: 426kB [00:00, 1.91MB/s]                    
2025-05-11 18:04:03 INFO: Downloaded file to /Users/sy/stanza_resources/resources.json
2025-05-11 18:04:03 INFO: Downloading default packages for language: la (Latin) ...
2025-05-11 18:04:05 INFO: File exists: /Users/sy/stanza_resources/la/default.zip
2025-05-11 18:04:08 INFO: Finished downloading models and saved to /Users/sy/stanza_resources
2025-05-11 18:04:08 INFO: Checking for updates to resources.json in case models have been updated.  Note: this behavior can be turned off with download_method=None or download_method=DownloadMethod.REUSE_RESOURCES
Downloading https://raw.githubusercontent.com/stanfordnlp/stanza-resources/main/resources_1.10.0.json: 426kB [00:00, 36.4MB/s]                    
2025-05-11 18:04:08 INFO: Downloaded file to /Users/sy/stanza_resources/resources.json
2025-05-11 18:04:08 INFO: Loading these models for language

Обработка файла: cleaned_anonymous_angelus.predicted_лем.csv
  → angelus: NOUN
  → anima: NOUN
  → Abyssus: NOUN
  → Abies: NOUN
  → Abortiuum: ADJ
  → accipiter: ADV
  → Acetum: NOUN
  → Acies: NOUN
  → aceruus: ADJ
  → accubitus: VERB
  → Acus: NOUN
  → adolescentula: VERB
  → Adeps: NOUN
  → Adamas: NOUN
  → aer: NOUN
  → ager: NOUN
  → agnus: NOUN
  → Agricola: NOUN
  → altare: NOUN
  → alitus: ADJ
  → Ala: NOUN
  → alienus: ADJ
  → altitudo: NOUN
  → Albugus: NOUN
  → Aloe: NOUN
  → altilis: ADJ
  → amicus: NOUN
  → amicus: NOUN
  → Amphora: NOUN
  → Amigdalus: ADJ
  → annulus: NOUN
  → annus: NOUN
  → Ancilla: NOUN
  → anaglypha: VERB
  → antrum: NOUN
  → antemurale: ADJ
  → animal: NOUN
  → angulus: NOUN
  → Apis: NOUN
  → aper: NOUN
  → aquilo: NOUN
  → aqua: NOUN
  → aquila: NOUN
  → Arturus: ADJ
  → Arista: ADJ
  → arcus: NOUN
  → argentum: NOUN
  → Aries: NOUN
  → armilla: NOUN
  → Arena: NOUN
  → armum: NOUN
  → Archaarca: ADJ
  → Arula: NOUN
  → armo: NOUN
  → artifex: NOU