In [None]:
!pip install sentencepiece spacy




In [None]:
import sentencepiece as spm

# Example Shona + slang sentences
with open("shona.txt", "w") as f:
    f.write("ndichakupai mari mangwana\n")
    f.write("handisi kuda\n")
    f.write("ndiri kufara big time\n")

# Train SentencePiece (unigram model, vocab 100)
spm.SentencePieceTrainer.Train(
    input="shona.txt", model_prefix="shona_sp", vocab_size=30, model_type="unigram"
)

# Load tokenizer
sp = spm.SentencePieceProcessor(model_file="shona_sp.model")

# Tokenize example
print(sp.encode("ndichakupai", out_type=str))

['▁ndi', 'c', 'ha', 'ku', 'p', 'a', 'i']


In [None]:
def shona_lemmatizer(word):
    # common subject prefixes
    prefixes = ["ndi", "ha", "va", "a", "ta"]
    suffixes = ["i", "sa", "wa"]

    lemma = word
    for p in prefixes:
        if lemma.startswith(p):
            lemma = lemma[len(p):]
    for s in suffixes:
        if lemma.endswith(s):
            lemma = lemma[:-len(s)]
    return lemma

# Example
print("Word: ndichakupai -> Lemma:", shona_lemmatizer("ndichakupai"))
print("Word: handisi -> Lemma:", shona_lemmatizer("handisi"))
print("Word: kufara -> Lemma:", shona_lemmatizer("kufara"))


Word: ndichakupai -> Lemma: chakupa
Word: handisi -> Lemma: ndis
Word: kufara -> Lemma: kufara


In [None]:
import pandas as pd
filepath = '/content/slang_dataset_with_intent.csv'
df = pd.read_csv(filepath)
df.head()

Unnamed: 0,message,is_code_mixed,sentiment,context,greeting_info,has_greeting,exact_slang,has_exact_slang,fuzzy_slang,has_fuzzy_slang,intent
0,Media omitted,True,unknown,general,[],False,[],False,[],False,chitchat
1,Hie Swit Mom,True,unknown,greeting,[],False,[],False,[],False,chitchat
2,Hi my dolliehow are u,True,unknown,greeting,"[{'term': 'hi', 'meaning': 'Hi (code-mixed)', ...",True,[],False,"['hi', 'u']",True,chitchat
3,By Gods grace am good Amai mwana venyu bhooooo...,True,unknown,general,[],False,[],False,"['am', 'bhooooooo']",True,religion
4,Sleep well everybody,True,unknown,general,[],False,[],False,[],False,chitchat


In [None]:
# Example seed dictionary for common Shona verbs/nouns
lemma_dict = {
    "ndichakupai": {"lemma": "kupa", "meaning": "give", "features": "Tense=Fut, Person=1, Number=Sing, Object=2Pl"},
    "handisi": {"lemma": "si", "meaning": "not / am not", "features": "Polarity=Neg, Person=1, Number=Sing"},
    "kuenda": {"lemma": "enda", "meaning": "go", "features": "VerbForm=Inf"},
    "kufara": {"lemma": "fara", "meaning": "be happy", "features": "VerbForm=Inf"},
    "muHarare": {"lemma": "Harare", "meaning": "Harare (city)", "features": "Locative"},
}


In [None]:
def shona_lemmatizer(word):
    # Check dictionary first
    if word in lemma_dict:
        return lemma_dict[word]

    # Rule-based stripping (very simple, expand as needed)
    prefixes = ["ndi", "ha", "va", "ma", "ku", "mu"]
    suffixes = ["i", "sa", "wa", "swa"]

    lemma = word
    for p in prefixes:
        if lemma.startswith(p) and len(lemma) > len(p)+2:
            lemma = lemma[len(p):]
    for s in suffixes:
        if lemma.endswith(s) and len(lemma) > len(s)+2:
            lemma = lemma[:-len(s)]

    return {"lemma": lemma, "meaning": "unknown", "features": "unknown"}


In [None]:
import re

# Tokenize simple (split by spaces, remove punctuation)
def tokenize(text):
    return re.findall(r"\w+", str(text).lower())

# Apply lemmatizer
df["tokens"] = df["message"].apply(tokenize)
df["lemmas"] = df["tokens"].apply(lambda words: [shona_lemmatizer(w) for w in words])

# Show sample
df[["message", "tokens", "lemmas"]].head(5)


Unnamed: 0,message,tokens,lemmas
0,Media omitted,"[media, omitted]","[{'lemma': 'media', 'meaning': 'unknown', 'fea..."
1,Hie Swit Mom,"[hie, swit, mom]","[{'lemma': 'hie', 'meaning': 'unknown', 'featu..."
2,Hi my dolliehow are u,"[hi, my, dolliehow, are, u]","[{'lemma': 'hi', 'meaning': 'unknown', 'featur..."
3,By Gods grace am good Amai mwana venyu bhooooo...,"[by, gods, grace, am, good, amai, mwana, venyu...","[{'lemma': 'by', 'meaning': 'unknown', 'featur..."
4,Sleep well everybody,"[sleep, well, everybody]","[{'lemma': 'sleep', 'meaning': 'unknown', 'fea..."


In [None]:
# Example pseudo-rules for verbs
lookup verbs = {
    gara:sit, famba:walk, taura:speak,bika :cook
}
rules = {
    "ku": {"VERB": "strip 'ku' "},    return base word
    eg kufamba -> ku + famba

}

In [1]:
# Inflectional prefixes (subject, tense, negation, noun classes)
INFLECTIONAL_PREFIXES = [
    "ndi", "va", "ha", "ta", "ma", "chi", "zvi", "ru", "ka", "tu", "hu", "ku", "pa", "mu", "ri", "sa", "se", "yo"
]

# Derivational suffixes (extensions, intensifiers, terminal vowels)
DERIVATIONAL_SUFFIXES = [
    "a", "i", "e", "o",   # terminal vowels
    "tu", "ana", "sa",   # adjectival intensifiers
    "is", "ir", "er", "unur", "an", "w",  # verb extensions
]

# Example lexicon of roots
ROOTS = {
    "famb": "walk",
    "gar": "sit/stay",
    "tuk": "scold",
    "bik": "cook",
    "sung": "tie",
    "dy": "eat",
}


In [2]:
def analyze_shona_word(word):
    analysis = {"word": word, "prefixes": [], "root": word, "suffixes": []}

    # Step 1: strip prefixes
    for pref in INFLECTIONAL_PREFIXES:
        if analysis["root"].startswith(pref) and len(analysis["root"]) > len(pref)+2:
            analysis["prefixes"].append(pref)
            analysis["root"] = analysis["root"][len(pref):]

    # Step 2: strip suffixes
    for suf in DERIVATIONAL_SUFFIXES:
        if analysis["root"].endswith(suf) and len(analysis["root"]) > len(suf)+1:
            analysis["suffixes"].append(suf)
            analysis["root"] = analysis["root"][:-len(suf)]

    # Step 3: map root to meaning
    lemma = ROOTS.get(analysis["root"], analysis["root"])
    analysis["lemma"] = analysis["root"]
    analysis["meaning"] = lemma

    return analysis


In [3]:
examples = ["ndichafamba", "handisati", "tukana", "sungunura", "mukomana", "makuru"]

for w in examples:
    print(analyze_shona_word(w))


{'word': 'ndichafamba', 'prefixes': ['ndi'], 'root': 'chafamb', 'suffixes': ['a'], 'lemma': 'chafamb', 'meaning': 'chafamb'}
{'word': 'handisati', 'prefixes': ['ha'], 'root': 'ndisat', 'suffixes': ['i'], 'lemma': 'ndisat', 'meaning': 'ndisat'}
{'word': 'tukana', 'prefixes': ['tu'], 'root': 'kan', 'suffixes': ['a'], 'lemma': 'kan', 'meaning': 'kan'}
{'word': 'sungunura', 'prefixes': [], 'root': 'sung', 'suffixes': ['a', 'unur'], 'lemma': 'sung', 'meaning': 'tie'}
{'word': 'mukomana', 'prefixes': ['mu'], 'root': 'kom', 'suffixes': ['a', 'an'], 'lemma': 'kom', 'meaning': 'kom'}
{'word': 'makuru', 'prefixes': ['ma'], 'root': 'kuru', 'suffixes': [], 'lemma': 'kuru', 'meaning': 'kuru'}


In [4]:
# Inflectional prefixes (from Mkanganwi, Table 2 + noun classes)
INFLECTIONAL_PREFIXES = [
    "ha", "ndi", "va", "ti", "ri", "ku", "mu", "chi", "zvi", "ru", "ma", "pa", "sa", "se", "yo"
]

# Derivational suffixes (verbal extensions, final vowels, adjectival)
DERIVATIONAL_SUFFIXES = [
    "a", "i", "e", "o",      # terminal vowels
    "an", "ana", "sa", "tu", # adjectival derivation
    "is", "ir", "er", "ur", "unur", # causatives/applicatives
    "w", "iw", "irw"         # passive forms
]

# Example root lexicon
ROOTS = {
    "tuk": "scold",
    "famb": "walk",
    "gar": "sit/stay",
    "sung": "tie",
    "bik": "cook",
    "nzwa": "hear/feel",
    "da": "love"
}

def mkanganwi_analyzer(word):
    analysis = {"word": word, "prefixes": [], "root": word, "suffixes": []}

    # 1. Strip prefixes (inflectional)
    for pref in INFLECTIONAL_PREFIXES:
        if analysis["root"].startswith(pref) and len(analysis["root"]) > len(pref) + 2:
            analysis["prefixes"].append(pref)
            analysis["root"] = analysis["root"][len(pref):]

    # 2. Strip suffixes (derivational)
    for suf in DERIVATIONAL_SUFFIXES:
        if analysis["root"].endswith(suf) and len(analysis["root"]) > len(suf) + 1:
            analysis["suffixes"].append(suf)
            analysis["root"] = analysis["root"][:-len(suf)]

    # 3. Map to meaning
    analysis["lemma"] = analysis["root"]
    analysis["meaning"] = ROOTS.get(analysis["root"], "unknown")

    return analysis

# Test examples (from Mkanganwi’s paper)
examples = [
    "ndichafamba",   # I will walk
    "handituki",     # I do not scold
    "sungunura",     # untie
    "tukana",        # scold each other
    "mugariri",      # resident/worker
    "makuru",        # big ones
]

for w in examples:
    print(mkanganwi_analyzer(w))


{'word': 'ndichafamba', 'prefixes': ['ndi'], 'root': 'chafamb', 'suffixes': ['a'], 'lemma': 'chafamb', 'meaning': 'unknown'}
{'word': 'handituki', 'prefixes': ['ha', 'ndi'], 'root': 'tuk', 'suffixes': ['i'], 'lemma': 'tuk', 'meaning': 'scold'}
{'word': 'sungunura', 'prefixes': [], 'root': 'sungun', 'suffixes': ['a', 'ur'], 'lemma': 'sungun', 'meaning': 'unknown'}
{'word': 'tukana', 'prefixes': [], 'root': 'tuk', 'suffixes': ['a', 'an'], 'lemma': 'tuk', 'meaning': 'scold'}
{'word': 'mugariri', 'prefixes': ['mu'], 'root': 'gar', 'suffixes': ['i', 'ir'], 'lemma': 'gar', 'meaning': 'sit/stay'}
{'word': 'makuru', 'prefixes': ['ma'], 'root': 'kuru', 'suffixes': [], 'lemma': 'kuru', 'meaning': 'unknown'}


In [None]:
import re

def shona_tokenizer(text: str):
    """Basic tokenizer: split on spaces & punctuation"""
    return re.findall(r"\w+", text.lower())

# Test
print(shona_tokenizer("Ndichakupai chikafu mangwana."))
# ['ndichakupai', 'chikafu', 'mangwana']


In [None]:
INFLECTIONAL_PREFIXES = ["ndi", "va", "ha", "ta", "ma", "chi", "zvi", "ru", "ka", "tu", "hu", "ku", "pa", "mu", "ri"]
DERIVATIONAL_SUFFIXES = ["a", "i", "e", "o", "an", "ana", "sa", "tu", "is", "ir", "er", "unur", "w"]

ROOTS = {
    "famb": {"lemma": "famba", "pos": "VERB"},
    "tuk": {"lemma": "tuka", "pos": "VERB"},
    "dy": {"lemma": "dya", "pos": "VERB"},
    "gar": {"lemma": "gara", "pos": "VERB"},
    "mari": {"lemma": "mari", "pos": "NOUN"},
    "chikafu": {"lemma": "chikafu", "pos": "NOUN"},
    "mangwana": {"lemma": "mangwana", "pos": "NOUN"},
}

def shona_lemmatizer(word: str):
    # dictionary first
    if word in ROOTS:
        return ROOTS[word]["lemma"]

    root = word
    # strip prefixes
    for p in INFLECTIONAL_PREFIXES:
        if root.startswith(p) and len(root) > len(p) + 2:
            root = root[len(p):]
            break

    # strip suffixes
    for s in DERIVATIONAL_SUFFIXES:
        if root.endswith(s) and len(root) > len(s) + 1:
            root = root[:-len(s)]
            break

    return ROOTS.get(root, {"lemma": root}).get("lemma", root)

# Test
print(shona_lemmatizer("ndichafamba"))   # → "famba"
print(shona_lemmatizer("handituki"))     # → "tuka"
print(shona_lemmatizer("mangwana"))      # → "mangwana"


In [None]:
def shona_pos_tagger(tokens):
    tagged = []
    for w in tokens:
        lemma = shona_lemmatizer(w)
        # lookup dictionary
        if lemma in [ROOTS[k]["lemma"] for k in ROOTS]:
            pos = [ROOTS[k]["pos"] for k in ROOTS if ROOTS[k]["lemma"] == lemma][0]
        else:
            # fallback heuristic: nouns often start with "chi-", "mu-", "ma-"
            if w.startswith(("chi", "mu", "ma")):
                pos = "NOUN"
            elif w.startswith(("ndi", "ha", "va", "ta")):
                pos = "VERB"
            else:
                pos = "X"
        tagged.append((w, lemma, pos))
    return tagged

# Test
sentence = "Ndichakupai chikafu mangwana"
tokens = shona_tokenizer(sentence)
print(shona_pos_tagger(tokens))


In [None]:
def shona_pipeline(text):
    tokens = shona_tokenizer(text)
    return shona_pos_tagger(tokens)

# Example
print(shona_pipeline("Handisi kudya mangwana"))


In [None]:
import spacy
from spacy.language import Language
from spacy.tokens import Doc, Token

# Register custom extensions if needed
Token.set_extension("shona_lemma", default=None, force=True)

@Language.factory("shona_lemmatizer")
def create_shona_lemmatizer(nlp, name):
    return ShonaLemmatizer()

class ShonaLemmatizer:
    def __init__(self):
        pass

    def __call__(self, doc: Doc):
        for token in doc:
            token._.shona_lemma = shona_lemmatizer(token.text)
        return doc


In [None]:
from spacy.util import compile_infix_regex
import re

def shona_tokenizer(nlp):
    # simple whitespace tokenizer
    return spacy.tokenizer.Tokenizer(nlp.vocab, token_match=re.compile(r'\w+').match)

# Create blank Shona pipeline
nlp = spacy.blank("xx")   # xx = custom, since Shona not built in
nlp.tokenizer = shona_tokenizer(nlp)

# Add custom components
nlp.add_pipe("shona_lemmatizer", last=True)


In [None]:
doc = nlp("Ndichakupai chikafu mangwana.")

for token in doc:
    print(f"Token: {token.text:<12} Lemma: {token._.shona_lemma}")


In [None]:
shona_spacy/
  __init__.py
  tokenizer.py
  lemmatizer.py
  pos_tagger.py
  pipeline.py
setup.py


In [None]:
from setuptools import setup, find_packages

setup(
    name="shona_spacy",
    version="0.1.0",
    packages=find_packages(),
    install_requires=["spacy>=3.0"],
    entry_points={"spacy_languages": ["shona = shona_spacy.pipeline:load_shona"]},
)


In [None]:
pip install shona_spacy


In [None]:
import spacy
nlp = spacy.load("shona")
doc = nlp("Handisi kudya mangwana")


In [1]:
# ==============================
# Shona Morphological Analyzer
# Baseline vs Mkanganwi
# ==============================

class ShonaAnalyzer:
    def __init__(self, mode="baseline"):
        self.mode = mode
        if mode == "baseline":
            self.INFLECTIONAL_PREFIXES = [
                "ndi","va","ha","ta","ma","chi","zvi","ru","ka","tu","hu","ku","pa","mu","ri","sa","se","yo"
            ]
            self.DERIVATIONAL_SUFFIXES = [
                "a","i","e","o","tu","ana","sa","is","ir","er","unur","an","w"
            ]
            self.ROOTS = {
                "famb":"walk",
                "gar":"sit/stay",
                "tuk":"scold",
                "bik":"cook",
                "sung":"tie",
                "dy":"eat",
            }
        elif mode == "mkanganwi":
            self.INFLECTIONAL_PREFIXES = [
                "ha","ndi","va","ti","ri","ku","mu","chi","zvi","ru","ma","pa","sa","se","yo"
            ]
            self.DERIVATIONAL_SUFFIXES = [
                "a","i","e","o","an","ana","sa","tu",
                "is","ir","er","ur","unur","w","iw","irw"
            ]
            self.ROOTS = {
                "tuk":"scold",
                "famb":"walk",
                "gar":"sit/stay",
                "sung":"tie",
                "bik":"cook",
                "nzwa":"hear/feel",
                "da":"love",
            }
        else:
            raise ValueError("Mode must be 'baseline' or 'mkanganwi'.")

    def analyze(self, word):
        analysis = {"word": word, "prefixes": [], "root": word, "suffixes": []}

        # Step 1: strip prefixes
        for pref in self.INFLECTIONAL_PREFIXES:
            if analysis["root"].startswith(pref) and len(analysis["root"]) > len(pref) + 2:
                analysis["prefixes"].append(pref)
                analysis["root"] = analysis["root"][len(pref):]
                break  # stop after first match

        # Step 2: strip suffixes
        for suf in self.DERIVATIONAL_SUFFIXES:
            if analysis["root"].endswith(suf) and len(analysis["root"]) > len(suf) + 1:
                analysis["suffixes"].append(suf)
                analysis["root"] = analysis["root"][:-len(suf)]
                break  # stop after first match

        # Step 3: map to meaning if known
        lemma = analysis["root"]
        meaning = self.ROOTS.get(analysis["root"], "unknown")
        analysis["lemma"] = lemma
        analysis["meaning"] = meaning

        return analysis


In [2]:
words = ["ndichafamba", "handituki", "sungunura", "tukana", "mugariri", "makuru"]

# Baseline analyzer
baseline_analyzer = ShonaAnalyzer(mode="baseline")
print("\n--- Baseline ---")
for w in words:
    print(baseline_analyzer.analyze(w))

# Mkanganwi analyzer
mkanganwi_analyzer = ShonaAnalyzer(mode="mkanganwi")
print("\n--- Mkanganwi ---")
for w in words:
    print(mkanganwi_analyzer.analyze(w))



--- Baseline ---
{'word': 'ndichafamba', 'prefixes': ['ndi'], 'root': 'chafamb', 'suffixes': ['a'], 'lemma': 'chafamb', 'meaning': 'unknown'}
{'word': 'handituki', 'prefixes': ['ha'], 'root': 'ndituk', 'suffixes': ['i'], 'lemma': 'ndituk', 'meaning': 'unknown'}
{'word': 'sungunura', 'prefixes': [], 'root': 'sungunur', 'suffixes': ['a'], 'lemma': 'sungunur', 'meaning': 'unknown'}
{'word': 'tukana', 'prefixes': ['tu'], 'root': 'kan', 'suffixes': ['a'], 'lemma': 'kan', 'meaning': 'unknown'}
{'word': 'mugariri', 'prefixes': ['mu'], 'root': 'garir', 'suffixes': ['i'], 'lemma': 'garir', 'meaning': 'unknown'}
{'word': 'makuru', 'prefixes': ['ma'], 'root': 'kuru', 'suffixes': [], 'lemma': 'kuru', 'meaning': 'unknown'}

--- Mkanganwi ---
{'word': 'ndichafamba', 'prefixes': ['ndi'], 'root': 'chafamb', 'suffixes': ['a'], 'lemma': 'chafamb', 'meaning': 'unknown'}
{'word': 'handituki', 'prefixes': ['ha'], 'root': 'ndituk', 'suffixes': ['i'], 'lemma': 'ndituk', 'meaning': 'unknown'}
{'word': 'sungu