# In [None]:  (cell marker left over from a Jupyter notebook export)
"""
shona_morph.py

Shona morphological analyzer as a spaCy pipeline component.
Uses a baseline/mkanganwi root dictionary + noun class (mipanda) prefixes.
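Annotates tokens with ._.shona (dict) containing:
  - word: str (the token text with surrounding whitespace stripped)
  - prefixes: list[str]
  - root: str
  - suffixes: list[str]
  - lemma: str
  - meaning: str ("unknown" when the root is not in the dictionary)
  - noun_class_num: int, float or None (sub-classes such as 1a are stored as 1.1)
  - noun_class_label: str or None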
"""

from typing import Dict, List, Optional, Tuple

import spacy
from spacy.tokens import Doc, Token, Span

class ShonaAnalyzerSimple:
    """Rule-based affix stripper for Shona word forms.

    Each supported mode ("baseline" or "mkanganwi") supplies a list of
    prefixes, a list of suffixes and a small root -> English gloss dictionary.

    Attributes set by ``__init__``:
        mode: name of the selected configuration.
        INFLECTIONAL_PREFIXES: prefixes that may be peeled off the front.
        DERIVATIONAL_SUFFIXES: suffixes that may be peeled off the end.
        ROOTS: mapping of known roots to a gloss.
    """

    # Upper bound on how many affixes are peeled from one side of a word.
    _MAX_AFFIXES = 3

    def __init__(self, mode: str = "baseline"):
        """Load the affix lists and root dictionary for ``mode``.

        Raises:
            ValueError: if ``mode`` is neither "baseline" nor "mkanganwi".
        """
        self.mode = mode
        if mode == "baseline":
            self.INFLECTIONAL_PREFIXES = [
                # common inflectional prefixes + many noun-class prefixes
                "ndi","nd","va","v","ha","ta","ma","chi","zvi","ru","ka","tu","hu","ku","pa","mu","ri","sa","se","yo",
                # noun class single-token forms
                "mw", "mi", "m", "zv", "dz", "dzv", "sv", "zi", "kw", "tw", "ruw"
            ]
            self.DERIVATIONAL_SUFFIXES = [
                "a","i","e","o","an","ana","sa","tu",
                "is","ir","er","ur","unur","w","iw","irw","ri"
            ]
            self.ROOTS = {
                "famb":"walk",
                "gar":"sit/stay",
                "tuk":"scold",
                "bik":"cook",
                "sung":"tie",
                "dy":"eat",
            }
        elif mode == "mkanganwi":
            self.INFLECTIONAL_PREFIXES = [
                "ha","ndi","va","ti","ri","ku","mu","chi","zvi","ru","ma","pa","sa","se","yo",
                "mw","mi","m","zv","dz","sv","zi","tw"
            ]
            self.DERIVATIONAL_SUFFIXES = [
                "a","i","e","o","an","ana","sa","tu",
                "is","ir","er","ur","unur","w","iw","irw"
            ]
            self.ROOTS = {
                "tuk":"scold",
                "famb":"walk",
                "gar":"sit/stay",
                "sung":"tie",
                "bik":"cook",
                "nzwa":"hear/feel",
                "da":"love",
            }
        else:
            raise ValueError("Mode must be 'baseline' or 'mkanganwi'.")

    def _peel(self, text: str, affixes: List[str], at_end: bool) -> List[Tuple[List[str], str]]:
        """Greedily peel affixes and return every intermediate state.

        Args:
            text: the string to strip.
            affixes: candidate affixes (prefixes or suffixes).
            at_end: strip from the end (suffixes) when true, else from the start.

        Returns:
            A list of ``(stripped_affixes, remainder)`` pairs. The first entry
            is always ``([], text)``; each later entry has one more affix
            removed. The last entry is the fully greedy result.
        """
        # Longest affix first so e.g. 'zvi' wins over 'zv'. Sorted once per
        # call rather than once per pass. Empty strings are skipped because
        # they would "match" everywhere without consuming anything.
        candidates = sorted((a for a in affixes if a), key=len, reverse=True)
        stripped: List[str] = []
        rest = text
        stages = [([], text)]
        for _ in range(self._MAX_AFFIXES):
            for affix in candidates:
                # Always leave at least two characters behind.
                if len(rest) <= len(affix) + 1:
                    continue
                if at_end:
                    if not rest.endswith(affix):
                        continue
                    rest = rest[:-len(affix)]
                else:
                    if not rest.startswith(affix):
                        continue
                    rest = rest[len(affix):]
                stripped.append(affix)
                stages.append((list(stripped), rest))
                break
            else:
                # No affix matched on this pass; further passes cannot either.
                break
        return stages

    def strip_prefixes(self, word: str) -> Tuple[List[str], str]:
        """Greedily strip up to three stacked prefixes (e.g. 'va-mu-...').

        On each pass the longest matching prefix is removed, provided at
        least two characters remain afterwards.

        Returns:
            ``(prefixes, remainder)`` with prefixes in the order stripped.
        """
        return self._peel(word, self.INFLECTIONAL_PREFIXES, at_end=False)[-1]

    def strip_suffixes(self, root: str) -> Tuple[List[str], str]:
        """Greedily strip up to three suffixes from the end of ``root``.

        On each pass the longest matching suffix is removed, provided at
        least two characters remain afterwards.

        Returns:
            ``(suffixes, remainder)`` with suffixes in the order stripped
            (outermost first).
        """
        return self._peel(root, self.DERIVATIONAL_SUFFIXES, at_end=True)[-1]

    def analyze_word(self, word: str) -> Dict:
        """Split ``word`` into prefixes, root and suffixes and gloss the root.

        The fully greedy split is tried first. If its root is not in
        ``ROOTS``, progressively shallower splits are tried and the first one
        whose root is known is used. Pure greedy stripping can eat into a
        root: baseline 'kutuka' loses 'tu' of the root 'tuk', and mkanganwi
        'kunzwa' loses the final 'a' of the root 'nzwa'. When no split
        reaches a known root the greedy result is returned with meaning
        "unknown".

        Returns:
            A dict with keys ``word``, ``prefixes``, ``root``, ``suffixes``,
            ``lemma`` (same as ``root``) and ``meaning``.
        """
        word = word.strip()
        prefix_stages = self._peel(word, self.INFLECTIONAL_PREFIXES, at_end=False)

        greedy = None
        known = None
        # Deepest split first, so the very first candidate is the greedy one.
        for prefixes, remainder in reversed(prefix_stages):
            suffix_stages = self._peel(remainder, self.DERIVATIONAL_SUFFIXES, at_end=True)
            for suffixes, root in reversed(suffix_stages):
                candidate = (prefixes, suffixes, root)
                if greedy is None:
                    greedy = candidate
                if root in self.ROOTS:
                    known = candidate
                    break
            if known is not None:
                break

        prefixes, suffixes, root = known if known is not None else greedy
        return {
            "word": word,
            "prefixes": prefixes,
            "root": root,
            "suffixes": suffixes,
            "lemma": root,
            "meaning": self.ROOTS.get(root, "unknown"),
        }


# --- Noun-class (mipanda) data (concise mapping) ---
# Maps each noun class number to the surface prefixes that mark it and a short
# descriptive label. Sub-classes use fractional keys: 1a -> 1.1, 2a -> 2.1,
# 2b -> 2.2, 17a -> 17.1.
NOUN_CLASS_PREFIXES = {
    # class : [prefixes...]
    1:  {"prefixes": ["mu","mw"],    "label": "Mupanda 1 (person singular)"},
    1.1:{"prefixes": ["Ø"],          "label": "Mupanda 1a (proper nouns / null prefix)"},
    2:  {"prefixes": ["va","v"],     "label": "Mupanda 2 (people plural)"},
    2.1:{"prefixes": ["va","a"],     "label": "Mupanda 2a (Manyika a/va mismatch)"},
    2.2:{"prefixes": ["a"],          "label": "Mupanda 2b (Zezuru a-)"},
    3:  {"prefixes": ["m","mw"],     "label": "Mupanda 3 (various things, some mass nouns)"},
    4:  {"prefixes": ["mi"],         "label": "Mupanda 4 (plural of 3)"},
    5:  {"prefixes": ["ri","z"],     "label": "Mupanda 5 (some singulars like 'ziso')"},
    6:  {"prefixes": ["ma"],         "label": "Mupanda 6 (plural of 5)"},
    7:  {"prefixes": ["chi","ch"],   "label": "Mupanda 7 (instrumental/augmentative etc.)"},
    8:  {"prefixes": ["zvi","zv"],   "label": "Mupanda 8 (plural of 7)"},
    9:  {"prefixes": ["N","n","mb","m","h"], "label": "Mupanda 9 (class with nasal/zero prefix: imba, mbudzi, huni)"},
    10: {"prefixes": ["dzi","dz"],   "label": "Mupanda 10 (plural of 9)"},
    11: {"prefixes": ["ru","rw"],    "label": "Mupanda 11 (abstracts/collectives like 'ruoko')"},
    12: {"prefixes": ["ka"],         "label": "Mupanda 12 (diminutive)"},
    13: {"prefixes": ["tu","tw"],    "label": "Mupanda 13 (plural diminutive)"},
    14: {"prefixes": ["u","hu","hw"],"label": "Mupanda 14 (abstracts, states)"},
    15: {"prefixes": ["ku"],         "label": "Mupanda 15 (infinitive nouns, verb nouns)"},
    16: {"prefixes": ["pa"],         "label": "Mupanda 16 (locative 'at')"},
    17: {"prefixes": ["ku"],         "label": "Mupanda 17 (locative 'to')"},
    17.1:{"prefixes":["Ø"],          "label": "Mupanda 17a (locative null prefix forms)"},
    18: {"prefixes": ["mu"],         "label": "Mupanda 18 (in/on locative 'in')"},
    19: {"prefixes": ["sv","svi"],   "label": "Mupanda 19 (Karanga forms)"},
    21: {"prefixes": ["zi","z"],     "label": "Mupanda 21 (augmentative / growth)"},
}

# Build a reverse lookup: lowercased prefix -> (class number, label).
# Several classes share a surface prefix ('mu' marks classes 1 and 18, 'ku'
# marks 15 and 17, 'va' marks 2 and 2a, ...). The first class listed above
# keeps the prefix, so a shared prefix resolves to its primary class instead
# of being silently overwritten by whichever class is listed last.
PREFIX_TO_CLASS = {}
for num, info in NOUN_CLASS_PREFIXES.items():
    for p in info["prefixes"]:
        PREFIX_TO_CLASS.setdefault(p.lower(), (num, info["label"]))


def detect_noun_class_from_prefix(prefixes: List[str]) -> Tuple[Optional[float], Optional[str]]:
    """Map a word's stripped prefixes to a noun class.

    The prefixes are tried longest first, regardless of their position in the
    word, and the first one found in ``PREFIX_TO_CLASS`` decides the class.
    Only when no prefix matches exactly does the heuristic fallback apply:
    any prefix starting with "m" is treated like the bare "m" prefix.

    Args:
        prefixes: prefixes detected on a word, in the order they were stripped.

    Returns:
        ``(class_number, label)``, or ``(None, None)`` when ``prefixes`` is
        empty or nothing matches. Sub-classes use fractional numbers
        (e.g. 2.1), hence the float annotation.
    """
    if not prefixes:
        return None, None

    keys = [pref.lower() for pref in sorted(prefixes, key=len, reverse=True)]

    # Exact matches always take precedence over the heuristic below.
    for key in keys:
        if key in PREFIX_TO_CLASS:
            return PREFIX_TO_CLASS[key]

    # Small normalization: fold unknown m-initial prefixes onto plain "m".
    if "m" in PREFIX_TO_CLASS and any(key.startswith("m") for key in keys):
        return PREFIX_TO_CLASS["m"]

    return None, None


# --- spaCy component ---
class ShonaMorphComponent:
    """Callable pipeline step that stores a Shona analysis on every token.

    Calling the component on a ``Doc`` sets ``token._.shona`` to the dict
    returned by ``ShonaAnalyzerSimple.analyze_word`` for the token text,
    extended with ``noun_class_num`` and ``noun_class_label``.
    """

    name = "shona_morph"

    def __init__(self, nlp=None, mode: str = "baseline"):
        """Create the analyzer for ``mode`` and register the token extension.

        ``nlp`` is only stored on the instance. The ``shona`` Token extension
        (default ``None``) is registered unless it already exists.
        """
        self.nlp = nlp
        self.analyzer = ShonaAnalyzerSimple(mode=mode)
        if Token.has_extension("shona"):
            return
        Token.set_extension("shona", default=None)

    def __call__(self, doc: Doc) -> Doc:
        """Annotate each token of ``doc`` in place and return ``doc``."""
        analyze = self.analyzer.analyze_word
        for tok in doc:
            result = analyze(tok.text)
            # The class is looked up for every token from its stripped
            # prefixes; no part-of-speech check is made first.
            class_num, class_label = detect_noun_class_from_prefix(result["prefixes"])
            result.update(noun_class_num=class_num, noun_class_label=class_label)
            tok._.shona = result
        return doc


# Convenience factory with the (nlp, name, ...) signature used for spaCy
# pipeline factories. This function does not register anything with spaCy
# itself; registration is left to the caller.
def make_shona_morph_component(nlp, name="shona_morph", mode="baseline"):
    """Build a ``ShonaMorphComponent`` bound to ``nlp`` for the given ``mode``.

    ``name`` is accepted for signature compatibility and is not used here.
    """
    component = ShonaMorphComponent(nlp=nlp, mode=mode)
    return component


# --- Example usage ---
if __name__ == "__main__":
    # pip install spacy
    from spacy.language import Language

    # Only the tokenizer and Token objects are used, so the language code of
    # the blank pipeline does not matter.
    nlp = spacy.blank("en")

    if hasattr(Language, "factory"):
        # spaCy v3+: add_pipe() only accepts the string name of a registered
        # factory (passing the component object raises an error), so register
        # the convenience factory once and add the component by name.
        if not Language.has_factory(ShonaMorphComponent.name):
            Language.factory(
                ShonaMorphComponent.name,
                default_config={"mode": "baseline"},
                func=make_shona_morph_component,
            )
        nlp.add_pipe(ShonaMorphComponent.name, last=True, config={"mode": "baseline"})
    else:
        # spaCy v2: add_pipe() takes the component callable itself.
        comp = ShonaMorphComponent(nlp=nlp, mode="baseline")
        nlp.add_pipe(comp, name=comp.name, last=True)

    examples = [
        "murume", "vanhu", "mwana", "vietete", "mugoti", "moto", "miri", "ziso", "dzimba", "ruoko",
        "kuzasi", "pachivanze", "mugoti"
    ]

    # Each example is a single word, so the analysis of interest is on the
    # first token of the resulting Doc.
    for w in examples:
        doc = nlp(w)
        t = doc[0]
        print(f"\nWORD: {w}")
        for k, v in t._.shona.items():
            print(f"  {k}: {v}")
