In [None]:
import pandas as pd
import re
from collections import Counter

# Read the chat export (edit the file path / column name to match your data)
df = pd.read_csv("shona_chats.csv")   # expected input file
# Keep only the rows whose "message" cell is present, as a plain Python list
messages = df.loc[df["message"].notna(), "message"].tolist()

# 1. Tokenizer
# A token is a run of word characters; apostrophes that sit *between* word
# characters are kept so spellings such as "n'anga" remain a single token.
_TOKEN_RE = re.compile(r"\w+(?:'\w+)*")


def tokenize(text):
    """Split *text* into lowercase word tokens.

    The input is coerced with ``str()`` and lowercased. The typographic
    apostrophe (U+2019) is folded to the ASCII one so both spellings of a
    word are counted together. Apostrophes are only kept inside a word;
    leading/trailing quotes and all other punctuation are dropped.

    Returns a list of token strings (empty when no word characters are found).
    """
    normalized = str(text).lower().replace("\u2019", "'")
    return _TOKEN_RE.findall(normalized)

# Flatten every message into one running list of tokens
tokens = [tok for msg in messages for tok in tokenize(msg)]

# 2. Count frequency (the counter's keys are exactly the distinct tokens)
freq = Counter(tokens)

print("Total tokens:", len(tokens))
print("Unique tokens:", len(freq))
print("Top 20 words:")
print(freq.most_common(20))

# 3. Simple prefix/suffix stripping (candidate root extraction)
INFLECTIONAL_PREFIXES = ["ndi", "va", "ha", "ta", "ma", "chi", "zvi", "ru", "ka", "tu", "hu", "ku", "pa", "mu", "ri"]
DERIVATIONAL_SUFFIXES = ["a", "i", "e", "o", "an", "ana", "sa", "tu", "is", "ir", "er", "ur", "unur", "w", "iw", "irw"]


def extract_root(word):
    """Return a candidate root for *word* by stripping one prefix and one suffix.

    At most one entry of INFLECTIONAL_PREFIXES is removed from the start,
    then at most one entry of DERIVATIONAL_SUFFIXES from the end. Affixes
    are tried longest first: a short suffix such as "a" would otherwise
    always win over "ana" or "sa" (and "w" over "iw"/"irw", "ur" over
    "unur"), leaving those entries unreachable.

    Length guards keep the result from collapsing: a prefix is only removed
    when at least three characters remain, a suffix only when at least two
    remain. If the longest matching affix fails its guard, shorter ones are
    still tried. Words with no applicable affix are returned unchanged.
    """
    root = word
    # sorted() is stable, so affixes of equal length keep their list order.
    # No listed prefix is a prefix of another, so ordering only affects suffixes.
    for p in sorted(INFLECTIONAL_PREFIXES, key=len, reverse=True):
        if root.startswith(p) and len(root) > len(p) + 2:
            root = root[len(p):]
            break
    for s in sorted(DERIVATIONAL_SUFFIXES, key=len, reverse=True):
        if root.endswith(s) and len(root) > len(s) + 1:
            root = root[:-len(s)]
            break
    return root

# Apply to the 500 most frequent words. Counter.most_common() orders entries
# by descending count; slicing freq.keys() would instead pick the first 500
# distinct words in the order they were first seen.
roots = [(w, count, extract_root(w)) for w, count in freq.most_common(500)]

# 4. Save to CSV for manual annotation
roots_df = pd.DataFrame(roots, columns=["word_form", "count", "candidate_root"])
roots_df.to_csv("shona_candidate_roots.csv", index=False)

print("Candidate roots saved to shona_candidate_roots.csv")
