# In [3]:
!pip install langdetect
!pip install fuzzywuzzy
from langdetect import detect, LangDetectException
from textblob import TextBlob
import re
import pandas as pd
from fuzzywuzzy import fuzz, process
from typing import Dict, List, Optional, Set

# Load the dataset into the DataFrame `df`; the labeling code further down
# reads its 'message' column.
# NOTE(review): the path is hard-coded under /content — presumably a
# Colab-style working directory; confirm before running elsewhere.
filepath = '/content/shona_combined_dataset.csv'
df = pd.read_csv(filepath)

# Greeting lexicon: maps a lower-case greeting (one or more words) to its
# metadata record. Each row below is
#   (term, meaning, context, tone, variant_of)
# where variant_of names the form the term derives from, or None.
greeting_dict = {
    term: {
        "meaning": meaning,
        "context": context,
        "tone": tone,
        "variant_of": variant_of,
    }
    for term, meaning, context, tone, variant_of in (
        ("madii", "How are you?", "start_conversation", "casual", "makadini"),
        ("wadii", "How are you?", "start_conversation", "casual", "wakadini"),
        ("makadini", "How are you? (plural/formal)", "start_conversation", "formal", None),
        ("ndeipi", "What's up?", "start_conversation", "slang", None),
        ("mangwanani", "Good morning", "start_conversation", "formal", None),
        ("maswera sei", "Good afternoon", "start_conversation", "formal", "maswera"),
        ("manheru", "Good evening", "start_conversation", "formal", None),
        ("mukuita sei", "How are you? (plural)", "start_conversation", "casual", "muri sei"),
        ("bho here", "What's up?", "start_conversation", "slang", "bho"),
        ("safe here", "What’s going on?", "check_in", "slang", "safe"),
        ("hoyoo", "What's going on?", "check_in", "casual", None),
        ("mhoro", "Hello!", "start_conversation", "neutral", None),
        ("hesi", "Hi!", "start_conversation", "casual", None),
        ("hi", "Hi (code-mixed)", "start_conversation", "code_mixed", "hie"),
        ("hello", "Hello (code-mixed)", "start_conversation", "code_mixed", None),
        ("hey", "Hey (code-mixed)", "start_conversation", "code_mixed", None),
        ("makadii henyu", "Very polite greeting", "start_conversation", "formal", "makadini"),
        ("tiripo", "We’re good (response)", "response", "neutral", None),
        ("tiri bho", "We're fine (slang)", "response", "slang", "bho"),
        ("masikati", "Good afternoon", "start_conversation", "formal", None),
        ("mamukasei", "How did you wake up?", "start_conversation", "formal", "mamuka"),
        ("mamuka", "Did you wake up well?", "start_conversation", "formal", None),
        ("maswera", "How was your day?", "start_conversation", "formal", "maswera sei"),
        ("zviri cei", "How are things?", "check_in", "slang", None),
        ("uri sei", "How are you?", "start_conversation", "casual", None),
        ("makasimba", "Are you well?", "start_conversation", "formal", None),
        ("rugare", "Peaceful greeting", "start_conversation", "formal", None),
        ("mushe", "Wellness greeting", "start_conversation", "neutral", None),
    )
}






def get_greeting_info(message: str, greeting_dict: Dict) -> List[Dict]:
    """Extract greeting information from a message.

    The message is lower-cased and split on whitespace, non-word characters
    are trimmed from both ends of every token, and every run of consecutive
    tokens (up to the word count of the longest key) is looked up in
    ``greeting_dict``. Looking up runs rather than single tokens is what lets
    multi-word keys such as "maswera sei" match; a single word that is itself
    a key is still reported on its own.

    Args:
        message: Raw message text. Anything that is not a ``str`` yields ``[]``.
        greeting_dict: Mapping of lower-case greeting term to a record with
            the keys ``meaning``, ``context``, ``tone`` and ``variant_of``.

    Returns:
        One dict per match, ordered by start position and then by phrase
        length, with the keys ``term``, ``meaning``, ``context``, ``tone``
        and ``variant_of``.
    """
    if not isinstance(message, str):
        return []

    # Trim punctuation glued to a word so "mhoro!" still matches "mhoro".
    tokens = [re.sub(r"^\W+|\W+$", "", raw) for raw in message.lower().split()]
    tokens = [tok for tok in tokens if tok]

    # The longest key (in words) bounds how many tokens are ever joined.
    max_words = max((len(key.split()) for key in greeting_dict), default=1)

    found = []
    for start in range(len(tokens)):
        last = min(start + max_words, len(tokens))
        for end in range(start + 1, last + 1):
            phrase = " ".join(tokens[start:end])
            info = greeting_dict.get(phrase)
            if info is None:
                continue
            found.append({
                "term": phrase,
                "meaning": info["meaning"],
                "context": info["context"],
                "tone": info["tone"],
                "variant_of": info["variant_of"]
            })
    return found

# Label every message with the greeting entries found in it.
# NOTE: this column is assigned again later in the script, after
# get_greeting_info is redefined and 'message' has been cast to str.
df['greeting_info'] = df['message'].apply(lambda x: get_greeting_info(x, greeting_dict))

# Slang lexicon: maps a slang term (one or more words) to its metadata record.
# Each row below is
#   (term, meaning, category, tone, part_of_speech, variant_of)
# where variant_of names a related/standard form, or None.
#
# Every term is listed exactly once. A dict literal silently keeps only the
# LAST value written for a repeated key (at the position of its first
# occurrence), so terms that used to be written twice — mudhara, jahman, bag,
# bharanzi, murungu, chigunduru, chigulani, bhileki, chuna, danda, dhipisa,
# dhoma, den — now carry that effective value directly. The resulting mapping
# (keys, key order and values) is the same as before.
slang_dict = {
    term: {
        "meaning": meaning,
        "category": category,
        "tone": tone,
        "part_of_speech": part_of_speech,
        "variant_of": variant_of,
    }
    for term, meaning, category, tone, part_of_speech, variant_of in (
        ("bho", "cool", "expression", "casual", "adjective", None),
        ("mudhara", "elder / father", "person", "respectful", "noun", None),
        ("jahman", "bro", "person", "casual", "noun", "blaz"),
        ("bag", "money", "finance", "slang", "noun", "mula"),
        ("bharanzi", "fool", "insult", "derogatory", "noun", "bhambi"),
        ("murungu", "boss / white man", "person", "respectful", "noun", None),
        ("chigunduru", "street kid", "person", "derogatory", "noun", "chigulani"),
        ("chigulani", "street kid", "person", "derogatory", "noun", "chigunduru"),
        ("bhileki", "bar", "place", "informal", "noun", None),
        ("chuna", "tell", "communication", "neutral", "verb", "taura"),
        ("danda", "drug", "substance", "informal", "noun", "chamba"),
        ("dhipisa", "complicate", "emotion", "frustrated", "verb", None),
        ("dhoma", "contemplate", "cognition", "serious", "verb", None),
        ("hant", "okay? / right?", "expression", "casual", "interjection", None),
        ("bigman", "bro", "person", "respectful", "noun", "blaz"),
        ("amana", "guys", "group", "neutral", "noun", None),
        ("hoyoo", "cool", "expression", "casual", "adjective", "bho"),
        ("murudo", "no hard feelings", "emotion", "reassuring", "expression", None),
        ("sinhi", "things", "object", "neutral", "noun", "sinde"),
        ("sinde", "things", "object", "neutral", "noun", "sinhi"),
        ("lapaz", "laptop", "tech", "neutral", "noun", "laptop"),
        ("momz", "mom", "family", "affectionate", "noun", "mother"),
        ("graft", "job", "work", "neutral", "noun", "job"),
        ("muripapi", "Where are you?", "location", "neutral", "question", None),
        ("ngomwa", "drug", "substance", "informal", "noun", "chamba"),
        ("kundifendera", "provoke", "emotion", "aggressive", "verb", None),
        ("pamhatso", "home", "location", "neutral", "noun", "zayan"),
        ("jabuna", "home", "location", "casual", "noun", "zayan"),
        ("den", "house", "place", "neutral", "noun", "home"),
        ("zayan", "home", "location", "neutral", "noun", None),
        ("blaz", "bro", "person", "slang", "noun", None),
        ("safe", "okay", "expression", "casual", "adjective", "bho"),
        ("donhera", "arrive", "movement", "neutral", "verb", "svika"),
        ("melaz", "marriage", "relationship", "neutral", "noun", "marriage"),
        ("tipa", "tip", "money", "casual", "verb", "give"),
        ("monaz", "morning", "time", "casual", "noun", "mangwanani"),
        ("skaten", "afternoon", "time", "casual", "noun", "masikati"),
        ("deepaz", "evening", "time", "casual", "noun", "manheru"),
        ("vhat", "water", "object", "neutral", "noun", "mvura"),
        ("madii", "Hi", "greeting", "casual", "expression", "makadini"),
        ("mukuita sei", "How is it going?", "greeting", "casual", "expression", "uri sei"),
        ("mati todii", "What’s the plan?", "question", "informal", "expression", None),
        ("kuchimhanya", "Do it", "action", "motivational", "verb", "ita"),
        ("hindava", "What’s wrong?", "question", "concerned", "expression", None),
        ("koso", "girl", "person", "slang", "noun", "nyana"),
        ("beda", "girl", "person", "slang", "noun", "nyana"),
        ("nyana", "girl", "person", "slang", "noun", None),
        ("kubaya", "going", "movement", "neutral", "verb", "kuenda"),
        ("kusofter", "propose (romantically)", "relationship", "casual", "verb", "propose"),
        ("keta", "know", "cognition", "neutral", "verb", "ziva"),
        ("mbashto", "magic", "expression", "playful", "noun", None),
        ("kusticker", "high (on drugs)", "state", "slang", "verb", "kusvinura"),
        ("pombi", "pretty", "appearance", "admiring", "adjective", None),
        ("bvrii", "lit / awesome", "expression", "excited", "adjective", "bho"),
        ("mota mota", "cool", "expression", "slang", "adjective", "bho"),
        ("makuruwane", "cool one", "compliment", "friendly", "noun", "blaz"),
        ("miswa", "stopped", "action", "neutral", "verb", "kumira"),
        ("bholato", "cool", "expression", "slang", "adjective", "bho"),
        ("chenga", "see", "cognition", "neutral", "verb", "ona"),
        ("maya", "no", "response", "dismissive", "interjection", "mela"),
        ("mela", "no", "response", "neutral", "interjection", "maya"),
        ("yesaya", "yes", "response", "positive", "interjection", "yes"),
        ("mujolo", "love / romantic relationship", "relationship", "casual", "noun", None),
        ("bhebha", "burning (figuratively: strong emotion, lust)", "emotion", "intense", "verb", None),
        ("hwai", "coward", "insult", "derogatory", "noun", None),
        ("gogaz", "granny", "family", "affectionate", "noun", "gogo"),
        ("ninez", "aunt", "family", "neutral", "noun", "gulez"),
        ("gulez", "aunt", "family", "neutral", "noun", "ninez"),
        ("khule", "uncle", "family", "neutral", "noun", None),
        ("muzaya", "nephew/niece", "family", "neutral", "noun", None),
        ("kelaz", "church", "place", "neutral", "noun", "church"),
        ("mvoda", "phone", "tech", "neutral", "noun", "foni"),
        ("gomba", "hunger", "need", "desperate", "noun", None),
        ("kuvhurika", "crazy", "mental_state", "informal", "adjective", None),
        ("boma", "jail", "place", "negative", "noun", None),
        ("chamba", "drug", "substance", "informal", "noun", "danda"),
        ("jipela", "chips", "food", "casual", "noun", None),
        ("mula", "money", "finance", "slang", "noun", "bag"),
        ("maObama", "money", "finance", "playful", "noun", "mula"),
        ("kita", "die", "life_event", "serious", "verb", "gula"),
        ("bhambi", "fool", "insult", "derogatory", "noun", "bharanzi"),
        ("mudhonhi", "baby", "person", "affectionate", "noun", "mucheche"),
        ("ndai", "money", "finance", "slang", "noun", "mula"),
        ("float", "money", "finance", "casual", "noun", "mula"),
        ("kushta", "sleep", "action", "casual", "verb", "kurara"),
        ("gula", "die", "life_event", "serious", "verb", "kita"),
        ("kudzimwa", "screwed / messed up", "state", "negative", "verb", None),
        ("kugula", "sick", "health", "serious", "verb", "kurwara"),
        ("chi round", "errand", "activity", "casual", "noun", "chi circle"),
        ("chi circle", "errand", "activity", "casual", "noun", "chi round"),
        ("tonaz", "town", "place", "neutral", "noun", "guta"),
        ("manake", "nice", "expression", "positive", "adjective", None),
        ("fazh", "school", "place", "neutral", "noun", "chikoro"),
    )
}



from fuzzywuzzy import process

def preprocess_text(text: str) -> str:
    """Normalize text to lower-case ASCII words separated by single spaces.

    Steps, in order:
      1. lower-case the text;
      2. NFKD-decompose it and drop combining marks, so accented letters fold
         to their base letter ("ó" -> "o") instead of disappearing, and
         compatibility whitespace such as a no-break space becomes a plain
         space instead of being deleted (which would glue two words together);
      3. replace every remaining non-word, non-space character with a space;
      4. drop whatever is still outside ASCII;
      5. collapse runs of whitespace.

    Args:
        text: Raw text. Anything that is not a ``str`` yields ``""``.

    Returns:
        The normalized string (possibly empty).
    """
    if not isinstance(text, str):
        return ""

    # Local import: only this function needs unicodedata.
    import unicodedata

    decomposed = unicodedata.normalize("NFKD", text.lower())
    # Combining marks are not word characters; remove them before the
    # punctuation pass so they are not turned into spaces inside a word.
    base_chars = "".join(ch for ch in decomposed if not unicodedata.combining(ch))
    # Punctuation becomes a space (not nothing) so "a,b" stays two words.
    spaced = re.sub(r'[^\w\s]', ' ', base_chars)
    ascii_only = spaced.encode('ascii', 'ignore').decode('ascii')
    return ' '.join(ascii_only.split())

def extract_slang_metadata(text: str, slang_dict: Dict, threshold: int = 87,
                           max_phrase_len: int = 3) -> List[Dict]:
    """Extract slang from text using fuzzy matching, including multi-word phrases.

    The text is normalized with ``preprocess_text`` and every run of
    1..``max_phrase_len`` consecutive words is fuzzy-matched against the keys
    of ``slang_dict``. A phrase is reported when its best match scores at
    least ``threshold`` and that key maps to a dict record.

    Args:
        text: Raw message text. Anything that is not a ``str`` yields ``[]``.
        slang_dict: Mapping of slang term to a record that may carry the keys
            ``meaning``, ``tone``, ``category`` and ``variant_of``. An empty
            mapping yields ``[]``.
        threshold: Minimum fuzzy score for a match to be kept.
        max_phrase_len: Longest phrase, in words, that is tried.

    Returns:
        One dict per distinct matched phrase, sorted by descending score,
        with the keys ``term``, ``matched_slang``, ``meaning``, ``tone``,
        ``category``, ``variant_of`` and ``score``.
    """
    # Nothing can match an empty glossary; returning here also avoids
    # unpacking the None that extractOne gives back for empty choices.
    if not isinstance(text, str) or not slang_dict:
        return []

    words = preprocess_text(text).split()
    glossary = list(slang_dict)

    results: Dict[str, Dict] = {}
    tried = set()
    for start in range(len(words)):
        last = min(start + max_phrase_len, len(words))
        for end in range(start + 1, last + 1):
            phrase = ' '.join(words[start:end])
            # The same phrase always scores the same, so score it only once.
            if phrase in tried:
                continue
            tried.add(phrase)

            try:
                best = process.extractOne(phrase, glossary)
            except (TypeError, ValueError):
                # Best effort: skip a phrase the matcher cannot handle, but
                # let anything else (e.g. KeyboardInterrupt) propagate.
                continue
            if best is None:
                continue

            match, score = best[0], best[1]
            if not match or score < threshold:
                continue
            slang_info = slang_dict.get(match)
            if not isinstance(slang_info, dict):
                continue
            results[phrase] = {
                "term": phrase,
                "matched_slang": match,
                "meaning": slang_info.get("meaning", ""),
                "tone": slang_info.get("tone", ""),
                "category": slang_info.get("category", ""),
                "variant_of": slang_info.get("variant_of", None),
                "score": score
            }

    # sorted() is stable, so equal scores keep their order of appearance.
    return sorted(results.values(), key=lambda rec: rec['score'], reverse=True)

def get_greeting_info(message: str, greeting_dict: Dict) -> List[Dict]:
    """Extract greeting information from a message.

    The message is normalized with ``preprocess_text`` and every run of
    consecutive words (up to the word count of the longest key) is looked up
    in ``greeting_dict``. Looking up runs rather than single words is what
    lets multi-word keys such as "maswera sei" match; a single word that is
    itself a key is still reported on its own.

    Args:
        message: Raw message text. Anything that is not a ``str`` yields ``[]``.
        greeting_dict: Mapping of lower-case greeting term to a record with
            the keys ``meaning``, ``context``, ``tone`` and ``variant_of``.

    Returns:
        One dict per match, ordered by start position and then by phrase
        length, with the keys ``term``, ``meaning``, ``context``, ``tone``
        and ``variant_of``.
    """
    if not isinstance(message, str):
        return []

    tokens = preprocess_text(message).split()
    # The longest key (in words) bounds how many tokens are ever joined.
    max_words = max((len(key.split()) for key in greeting_dict), default=1)

    found = []
    for start in range(len(tokens)):
        last = min(start + max_words, len(tokens))
        for end in range(start + 1, last + 1):
            phrase = ' '.join(tokens[start:end])
            info = greeting_dict.get(phrase)
            if info is None:
                continue
            found.append({
                "term": phrase,
                "meaning": info["meaning"],
                "context": info["context"],
                "tone": info["tone"],
                "variant_of": info["variant_of"]
            })
    return found

def is_greeting(text: str, greeting_keys: Set[str]) -> bool:
    """Check if text contains any greeting term as a whole word or phrase.

    Matching is done on word boundaries rather than raw substrings, so a
    short key such as "hi" or "hey" no longer fires inside unrelated words
    like "this", "chii" or "they". Multi-word keys match when their words
    appear consecutively.

    Args:
        text: Raw message text. Anything that is not a ``str`` yields ``False``.
        greeting_keys: Greeting terms, lower-case with single spaces between
            words (the form ``preprocess_text`` produces).

    Returns:
        ``True`` if at least one greeting term occurs in the text.
    """
    if not isinstance(text, str):
        return False
    # preprocess_text returns single-space separated words; padding both the
    # text and the key with a space turns "in" into a whole-word test.
    padded = f" {preprocess_text(text)} "
    return any(f" {greet} " in padded for greet in greeting_keys)

def is_slang(text: str, slang_keys: Set[str]) -> bool:
    """Check if text contains any slang terms.

    The text is normalized (and therefore lower-cased) by ``preprocess_text``,
    so each key is lower-cased too before the comparison; otherwise a key
    written with capitals, such as "maObama", could never match.

    Args:
        text: Raw message text. Anything that is not a ``str`` yields ``False``.
        slang_keys: Slang terms to look for.

    Returns:
        ``True`` if at least one term occurs in the normalized text.
    """
    if not isinstance(text, str):
        return False
    text = preprocess_text(text)
    # NOTE(review): this is a substring test, so short keys ("den", "bag",
    # "bho") also fire inside longer words. That may be intended to catch
    # prefixed forms — confirm before tightening to whole words.
    return any(term.lower() in text for term in slang_keys)

def is_code_mixed(text: str) -> bool:
    """Report whether the detected language of ``text`` is anything but 'sn'.

    Non-string input, and text for which detection raises
    ``LangDetectException``, are both reported as ``False``.
    """
    if isinstance(text, str):
        try:
            detected = detect(text)
        except LangDetectException:
            return False
        # NOTE(review): langdetect's documented language list does not appear
        # to include 'sn' (Shona) — confirm. If it is absent, detect() can
        # never return it and this is True for every detectable text.
        return detected != 'sn'
    return False

def get_sentiment(text: str) -> str:
    """Analyze sentiment of English text.

    Args:
        text: Raw message text.

    Returns:
        ``"positive"`` when the TextBlob polarity is above 0.1,
        ``"negative"`` when it is below -0.1, ``"neutral"`` otherwise, and
        ``"unknown"`` when the input is not a ``str``, is not detected as
        English, or language detection / sentiment analysis fails.
    """
    if not isinstance(text, str):
        return "unknown"
    try:
        if detect(text) != "en":
            return "unknown"
        score = TextBlob(text).sentiment.polarity
    except Exception:
        # Best effort per message: any library failure is labeled "unknown".
        # Catching Exception rather than using a bare except lets
        # KeyboardInterrupt / SystemExit stop a long-running apply().
        return "unknown"

    if score > 0.1:
        return "positive"
    if score < -0.1:
        return "negative"
    return "neutral"

def get_context(text: str, slang_keys: Set[str]) -> str:
    """Classify a message into a coarse topic label.

    The normalized text is checked against keyword groups in a fixed order
    and the first hit wins: 'religion', 'finance', 'greeting' (via
    ``is_greeting`` and the module-level ``greeting_keys``), 'movement';
    otherwise 'general'. Non-string input gives 'unknown'.

    ``slang_keys`` is accepted but not used by this function.
    """
    if not isinstance(text, str):
        return 'unknown'

    cleaned = preprocess_text(text)

    def mentions(keywords) -> bool:
        # NOTE(review): substring test — e.g. 'mwari' also occurs inside
        # 'shamwari'; confirm whether whole-word matching is wanted.
        for keyword in keywords:
            if keyword in cleaned:
                return True
        return False

    if mentions(('mwari', 'pray', 'amen')):
        return 'religion'
    if mentions(('mari', 'dollar', 'float', 'bag', 'mula')):
        return 'finance'
    if is_greeting(cleaned, greeting_keys):
        return 'greeting'
    if mentions(('kuya', 'kugrafta', 'kubaya', 'chi round')):
        return 'movement'
    return 'general'

# Build the key sets once so the per-row lambdas below do not rebuild them.
greeting_keys = set(greeting_dict.keys())
slang_keys = set(slang_dict.keys())

# Apply labeling: each statement adds one column to df. Order matters — the
# slang_* and has_slang columns are derived from 'slang_metadata' /
# 'slang_terms', which must already exist.
# NOTE(review): astype(str) presumably turns missing values into the literal
# string 'nan', which would then be labeled like real text — confirm.
df['message'] = df['message'].astype(str)  # Cast every message to str before labeling
df['greeting_info'] = df['message'].apply(lambda x: get_greeting_info(x, greeting_dict))
df['is_greeting'] = df['message'].apply(lambda x: is_greeting(x, greeting_keys))
df['is_slang'] = df['message'].apply(lambda x: is_slang(x, slang_keys))
df['slang_metadata'] = df['message'].apply(lambda x: extract_slang_metadata(x, slang_dict))
df['code_mixed'] = df['message'].apply(is_code_mixed)
df['sentiment'] = df['message'].apply(get_sentiment)
df['context'] = df['message'].apply(lambda x: get_context(x, slang_keys))
# Flatten the per-message slang records into parallel list columns; tones,
# categories and variants are de-duplicated through a set, so their order is
# not guaranteed.
df['slang_terms'] = df['slang_metadata'].apply(lambda lst: [x['matched_slang'] for x in lst])
df['slang_meanings'] = df['slang_metadata'].apply(lambda lst: [x['meaning'] for x in lst])
df['slang_tones'] = df['slang_metadata'].apply(lambda lst: list(set(x['tone'] for x in lst if x['tone'])))
df['slang_categories'] = df['slang_metadata'].apply(lambda lst: list(set(x['category'] for x in lst if x['category'])))
df['slang_variants'] = df['slang_metadata'].apply(lambda lst: list(set(x['variant_of'] for x in lst if x['variant_of'])))
# has_slang reflects the fuzzy matcher; is_slang above is the exact-substring check.
df['has_slang'] = df['slang_terms'].apply(lambda x: len(x) > 0)

# Save the labeled DataFrame as CSV (without the index column).
# NOTE(review): the path is directly under /content, not under a mounted
# Drive folder — confirm this is where the output should persist.
output_path = '/content/labeled_shona_messages.csv'
df.to_csv(output_path, index=False)



# In [3]:
!pip install langdetect
!pip install fuzzywuzzy
import re
import unicodedata
from typing import Dict, List, Optional, Set

import pandas as pd
from fuzzywuzzy import fuzz, process
from langdetect import detect, LangDetectException
from textblob import TextBlob

# Load the raw message dataset into df. The labeling code further down reads
# its 'message' column, so the CSV must provide one.
filepath = '/content/shona_combined_dataset.csv'
df = pd.read_csv(filepath)

def _greeting_entry(meaning, context, tone, variant_of=None):
    """Build one greeting record with the four fields every entry carries."""
    return {
        "meaning": meaning,
        "context": context,
        "tone": tone,
        "variant_of": variant_of,
    }


# Greeting lexicon keyed by lower-case surface form. Each value records the
# English meaning, the conversational context, the tone, and (optionally) the
# form this greeting is listed as a variant of.
greeting_dict = {
    "madii": _greeting_entry("How are you?", "start_conversation", "casual", "makadini"),
    "wadii": _greeting_entry("How are you?", "start_conversation", "casual", "wakadini"),
    "makadini": _greeting_entry("How are you? (plural/formal)", "start_conversation", "formal"),
    "ndeipi": _greeting_entry("What's up?", "start_conversation", "slang"),
    "mangwanani": _greeting_entry("Good morning", "start_conversation", "formal"),
    "maswera sei": _greeting_entry("Good afternoon", "start_conversation", "formal", "maswera"),
    "manheru": _greeting_entry("Good evening", "start_conversation", "formal"),
    "mukuita sei": _greeting_entry("How are you? (plural)", "start_conversation", "casual", "muri sei"),
    "bho here": _greeting_entry("What's up?", "start_conversation", "slang", "bho"),
    "safe here": _greeting_entry("What’s going on?", "check_in", "slang", "safe"),
    "hoyoo": _greeting_entry("What's going on?", "check_in", "casual"),
    "mhoro": _greeting_entry("Hello!", "start_conversation", "neutral"),
    "hesi": _greeting_entry("Hi!", "start_conversation", "casual"),
    "hi": _greeting_entry("Hi (code-mixed)", "start_conversation", "code_mixed", "hie"),
    "hello": _greeting_entry("Hello (code-mixed)", "start_conversation", "code_mixed"),
    "hey": _greeting_entry("Hey (code-mixed)", "start_conversation", "code_mixed"),
    "makadii henyu": _greeting_entry("Very polite greeting", "start_conversation", "formal", "makadini"),
    "tiripo": _greeting_entry("We’re good (response)", "response", "neutral"),
    "tiri bho": _greeting_entry("We're fine (slang)", "response", "slang", "bho"),
    "masikati": _greeting_entry("Good afternoon", "start_conversation", "formal"),
    "mamukasei": _greeting_entry("How did you wake up?", "start_conversation", "formal", "mamuka"),
    "mamuka": _greeting_entry("Did you wake up well?", "start_conversation", "formal"),
    "maswera": _greeting_entry("How was your day?", "start_conversation", "formal", "maswera sei"),
    "zviri cei": _greeting_entry("How are things?", "check_in", "slang"),
    "uri sei": _greeting_entry("How are you?", "start_conversation", "casual"),
    "makasimba": _greeting_entry("Are you well?", "start_conversation", "formal"),
    "rugare": _greeting_entry("Peaceful greeting", "start_conversation", "formal"),
    "mushe": _greeting_entry("Wellness greeting", "start_conversation", "neutral"),
}

def _slang_entry(meaning, category, tone, part_of_speech, variant_of=None):
    """Build one slang record with the five fields every entry carries."""
    return {
        "meaning": meaning,
        "category": category,
        "tone": tone,
        "part_of_speech": part_of_speech,
        "variant_of": variant_of,
    }


# Slang lexicon keyed by lower-case surface form.
#
# Every key appears exactly once. The previous literal defined thirteen keys
# twice; Python keeps only the last definition of a repeated key, so the
# values below are the ones that were already in effect.
# Keys are lower-case because lookups run on text that preprocess_text has
# lower-cased; a mixed-case key could never match exactly.
slang_dict = {
    "bho": _slang_entry("cool", "expression", "casual", "adjective"),
    "mudhara": _slang_entry("elder / father", "person", "respectful", "noun"),
    "jahman": _slang_entry("bro", "person", "casual", "noun", "blaz"),
    "bag": _slang_entry("money", "finance", "slang", "noun", "mula"),
    "bharanzi": _slang_entry("fool", "insult", "derogatory", "noun", "bhambi"),
    "murungu": _slang_entry("boss / white man", "person", "respectful", "noun"),
    "chigunduru": _slang_entry("street kid", "person", "derogatory", "noun", "chigulani"),
    "chigulani": _slang_entry("street kid", "person", "derogatory", "noun", "chigunduru"),
    "bhileki": _slang_entry("bar", "place", "informal", "noun"),
    "chuna": _slang_entry("tell", "communication", "neutral", "verb", "taura"),
    "danda": _slang_entry("drug", "substance", "informal", "noun", "chamba"),
    "dhipisa": _slang_entry("complicate", "emotion", "frustrated", "verb"),
    "dhoma": _slang_entry("contemplate", "cognition", "serious", "verb"),
    "hant": _slang_entry("okay? / right?", "expression", "casual", "interjection"),
    "bigman": _slang_entry("bro", "person", "respectful", "noun", "blaz"),
    "amana": _slang_entry("guys", "group", "neutral", "noun"),
    "hoyoo": _slang_entry("cool", "expression", "casual", "adjective", "bho"),
    "murudo": _slang_entry("no hard feelings", "emotion", "reassuring", "expression"),
    "sinhi": _slang_entry("things", "object", "neutral", "noun", "sinde"),
    "sinde": _slang_entry("things", "object", "neutral", "noun", "sinhi"),
    "lapaz": _slang_entry("laptop", "tech", "neutral", "noun", "laptop"),
    "momz": _slang_entry("mom", "family", "affectionate", "noun", "mother"),
    "graft": _slang_entry("job", "work", "neutral", "noun", "job"),
    "muripapi": _slang_entry("Where are you?", "location", "neutral", "question"),
    "ngomwa": _slang_entry("drug", "substance", "informal", "noun", "chamba"),
    "kundifendera": _slang_entry("provoke", "emotion", "aggressive", "verb"),
    "pamhatso": _slang_entry("home", "location", "neutral", "noun", "zayan"),
    "jabuna": _slang_entry("home", "location", "casual", "noun", "zayan"),
    "den": _slang_entry("house", "place", "neutral", "noun", "home"),
    "zayan": _slang_entry("home", "location", "neutral", "noun"),
    "blaz": _slang_entry("bro", "person", "slang", "noun"),
    "safe": _slang_entry("okay", "expression", "casual", "adjective", "bho"),
    "donhera": _slang_entry("arrive", "movement", "neutral", "verb", "svika"),
    "melaz": _slang_entry("marriage", "relationship", "neutral", "noun", "marriage"),
    "tipa": _slang_entry("tip", "money", "casual", "verb", "give"),
    "monaz": _slang_entry("morning", "time", "casual", "noun", "mangwanani"),
    "skaten": _slang_entry("afternoon", "time", "casual", "noun", "masikati"),
    "deepaz": _slang_entry("evening", "time", "casual", "noun", "manheru"),
    "vhat": _slang_entry("water", "object", "neutral", "noun", "mvura"),
    "madii": _slang_entry("Hi", "greeting", "casual", "expression", "makadini"),
    "mukuita sei": _slang_entry("How is it going?", "greeting", "casual", "expression", "uri sei"),
    "mati todii": _slang_entry("What’s the plan?", "question", "informal", "expression"),
    "kuchimhanya": _slang_entry("Do it", "action", "motivational", "verb", "ita"),
    "hindava": _slang_entry("What’s wrong?", "question", "concerned", "expression"),
    "koso": _slang_entry("girl", "person", "slang", "noun", "nyana"),
    "beda": _slang_entry("girl", "person", "slang", "noun", "nyana"),
    "nyana": _slang_entry("girl", "person", "slang", "noun"),
    "kubaya": _slang_entry("going", "movement", "neutral", "verb", "kuenda"),
    "kusofter": _slang_entry("propose (romantically)", "relationship", "casual", "verb", "propose"),
    "keta": _slang_entry("know", "cognition", "neutral", "verb", "ziva"),
    "mbashto": _slang_entry("magic", "expression", "playful", "noun"),
    "kusticker": _slang_entry("high (on drugs)", "state", "slang", "verb", "kusvinura"),
    "pombi": _slang_entry("pretty", "appearance", "admiring", "adjective"),
    "bvrii": _slang_entry("lit / awesome", "expression", "excited", "adjective", "bho"),
    "mota mota": _slang_entry("cool", "expression", "slang", "adjective", "bho"),
    "makuruwane": _slang_entry("cool one", "compliment", "friendly", "noun", "blaz"),
    "miswa": _slang_entry("stopped", "action", "neutral", "verb", "kumira"),
    "bholato": _slang_entry("cool", "expression", "slang", "adjective", "bho"),
    "chenga": _slang_entry("see", "cognition", "neutral", "verb", "ona"),
    "maya": _slang_entry("no", "response", "dismissive", "interjection", "mela"),
    "mela": _slang_entry("no", "response", "neutral", "interjection", "maya"),
    "yesaya": _slang_entry("yes", "response", "positive", "interjection", "yes"),
    "mujolo": _slang_entry("love / romantic relationship", "relationship", "casual", "noun"),
    "bhebha": _slang_entry("burning (figuratively: strong emotion, lust)", "emotion", "intense", "verb"),
    "hwai": _slang_entry("coward", "insult", "derogatory", "noun"),
    "gogaz": _slang_entry("granny", "family", "affectionate", "noun", "gogo"),
    "ninez": _slang_entry("aunt", "family", "neutral", "noun", "gulez"),
    "gulez": _slang_entry("aunt", "family", "neutral", "noun", "ninez"),
    "khule": _slang_entry("uncle", "family", "neutral", "noun"),
    "muzaya": _slang_entry("nephew/niece", "family", "neutral", "noun"),
    "kelaz": _slang_entry("church", "place", "neutral", "noun", "church"),
    "mvoda": _slang_entry("phone", "tech", "neutral", "noun", "foni"),
    "gomba": _slang_entry("hunger", "need", "desperate", "noun"),
    "kuvhurika": _slang_entry("crazy", "mental_state", "informal", "adjective"),
    "boma": _slang_entry("jail", "place", "negative", "noun"),
    "chamba": _slang_entry("drug", "substance", "informal", "noun", "danda"),
    "jipela": _slang_entry("chips", "food", "casual", "noun"),
    "mula": _slang_entry("money", "finance", "slang", "noun", "bag"),
    "maobama": _slang_entry("money", "finance", "playful", "noun", "mula"),
    "kita": _slang_entry("die", "life_event", "serious", "verb", "gula"),
    "bhambi": _slang_entry("fool", "insult", "derogatory", "noun", "bharanzi"),
    "mudhonhi": _slang_entry("baby", "person", "affectionate", "noun", "mucheche"),
    "ndai": _slang_entry("money", "finance", "slang", "noun", "mula"),
    "float": _slang_entry("money", "finance", "casual", "noun", "mula"),
    "kushta": _slang_entry("sleep", "action", "casual", "verb", "kurara"),
    "gula": _slang_entry("die", "life_event", "serious", "verb", "kita"),
    "kudzimwa": _slang_entry("screwed / messed up", "state", "negative", "verb"),
    "kugula": _slang_entry("sick", "health", "serious", "verb", "kurwara"),
    "chi round": _slang_entry("errand", "activity", "casual", "noun", "chi circle"),
    "chi circle": _slang_entry("errand", "activity", "casual", "noun", "chi round"),
    "tonaz": _slang_entry("town", "place", "neutral", "noun", "guta"),
    "manake": _slang_entry("nice", "expression", "positive", "adjective"),
    "fazh": _slang_entry("school", "place", "neutral", "noun", "chikoro"),
}


def preprocess_text(text: str) -> str:
    """Normalize a message for dictionary lookups.

    Lower-cases the text, replaces punctuation with spaces, folds accented
    letters to their ASCII base letter and collapses runs of whitespace.

    Args:
        text: Raw message. Non-string values are treated as empty.

    Returns:
        Lower-case ASCII text with single spaces between tokens, or "" for
        non-string input.
    """
    if not isinstance(text, str):
        return ""
    # Punctuation becomes a space (not nothing) so "hesi,blaz" still splits
    # into two tokens.
    text = re.sub(r'[^\w\s]', ' ', text.lower())
    # Decompose first ("é" -> "e" + combining accent) so that dropping the
    # non-ASCII remainder keeps the base letter; encoding directly would
    # delete the whole character and turn "café" into "caf".
    text = unicodedata.normalize('NFKD', text)
    text = text.encode('ascii', 'ignore').decode('ascii')
    return ' '.join(text.split())

from fuzzywuzzy import process
def extract_exact_slang(
    text: str,
    slang_dict: Dict,
    max_phrase_words: int = 3,
) -> List[Dict]:
    """Extract exact slang matches from the text based on the slang dictionary.

    The text is normalized with preprocess_text and scanned left to right.
    At each position the longest phrase (up to ``max_phrase_words`` words)
    that is a key of ``slang_dict`` wins, so multi-word entries such as
    "chi round" are found; matching word by word could never reach them.

    Args:
        text: Raw message. Non-string values yield an empty list.
        slang_dict: Mapping of slang term to its metadata dict.
        max_phrase_words: Longest phrase, in words, to look up. Values below
            1 are treated as 1.

    Returns:
        One metadata dict per match, in order of appearance, with the keys
        "term", "meaning", "category", "tone", "part_of_speech" and
        "variant_of".
    """
    if not isinstance(text, str) or not slang_dict:
        return []

    words = preprocess_text(text).split()
    longest = max(1, max_phrase_words)
    results = []
    position = 0

    while position < len(words):
        remaining = len(words) - position
        for size in range(min(longest, remaining), 0, -1):
            term = ' '.join(words[position:position + size])
            if term in slang_dict:
                meta = slang_dict[term]
                results.append({
                    "term": term,
                    "meaning": meta.get("meaning", ""),
                    "category": meta.get("category", ""),
                    "tone": meta.get("tone", ""),
                    "part_of_speech": meta.get("part_of_speech", ""),
                    "variant_of": meta.get("variant_of", None)
                })
                # Skip the words this phrase consumed so they are not
                # matched a second time as shorter terms.
                position += size
                break
        else:
            position += 1

    return results

def highlight_fuzzy_slang(text: str, slang_dict: Dict, threshold: int = 90) -> List[str]:
    """Return the words of ``text`` that fuzzily match a slang term.

    Used for highlighting only (not labeling): the returned items are the
    normalized words from the message, not the dictionary terms they matched.

    Args:
        text: Raw message. Non-string values yield an empty list.
        slang_dict: Mapping whose keys are the slang terms to compare with.
        threshold: Minimum fuzzywuzzy score (0-100) for a word to count.

    Returns:
        Matching words in order of appearance; repeated words are repeated.
    """
    if not isinstance(text, str):
        return []

    glossary = list(slang_dict)
    if not glossary:
        # With no choices extractOne returns None, which cannot be unpacked
        # into (match, score); nothing can match anyway.
        return []

    matches = []
    for word in preprocess_text(text).split():
        best = process.extractOne(word, glossary)
        if best is None:
            continue
        match, score = best[0], best[1]
        if match and score >= threshold:
            matches.append(word)

    return matches

def get_greeting_info(
    message: str,
    greeting_dict: Dict,
    max_phrase_words: int = 3,
) -> List[Dict]:
    """Extract greeting information from a message.

    The message is normalized with preprocess_text and scanned left to right.
    At each position the longest phrase (up to ``max_phrase_words`` words)
    that is a key of ``greeting_dict`` wins. This makes multi-word greetings
    such as "maswera sei" or "uri sei" reachable and prefers them over a
    shorter key that is their prefix (e.g. "maswera").

    Args:
        message: Raw message. Non-string values yield an empty list.
        greeting_dict: Mapping of greeting term to a dict with the keys
            "meaning", "context", "tone" and "variant_of".
        max_phrase_words: Longest phrase, in words, to look up. Values below
            1 are treated as 1.

    Returns:
        One dict per greeting found, in order of appearance, with the keys
        "term", "meaning", "context", "tone" and "variant_of".
    """
    if not isinstance(message, str) or not greeting_dict:
        return []

    words = preprocess_text(message).split()
    longest = max(1, max_phrase_words)
    found = []
    position = 0

    while position < len(words):
        remaining = len(words) - position
        for size in range(min(longest, remaining), 0, -1):
            term = ' '.join(words[position:position + size])
            if term in greeting_dict:
                info = greeting_dict[term]
                found.append({
                    "term": term,
                    "meaning": info["meaning"],
                    "context": info["context"],
                    "tone": info["tone"],
                    "variant_of": info["variant_of"]
                })
                # Skip the words this greeting consumed so "maswera sei" is
                # not reported again as "maswera".
                position += size
                break
        else:
            position += 1

    return found

def is_greeting(text: str, greeting_keys: Set[str]) -> bool:
    """Check whether the text contains any greeting term as a whole word or phrase.

    Matching is done on word boundaries. A plain substring test would report
    the key "hi" inside unrelated words such as "this" or "chikoro".

    Args:
        text: Raw message. Non-string values are never greetings.
        greeting_keys: Lower-case greeting terms; a term may contain spaces.

    Returns:
        True if at least one term occurs in the normalized text.
    """
    if not isinstance(text, str) or not greeting_keys:
        return False
    # preprocess_text leaves single spaces between tokens, so padding both
    # sides with a space turns "whole word / whole phrase" into a substring
    # test that also works for multi-word keys.
    padded = f" {preprocess_text(text)} "
    return any(f" {greet} " in padded for greet in greeting_keys)



def is_code_mixed(text: str) -> bool:
    """Report whether langdetect labels the text as something other than 'sn'.

    Non-string input, and text langdetect cannot classify, count as not
    code-mixed.

    NOTE(review): langdetect's documented language list does not appear to
    include 'sn' (Shona) - confirm. If it does not, this returns True for
    every message that langdetect can classify at all.
    """
    if isinstance(text, str):
        try:
            language = detect(text)
        except LangDetectException:
            return False
        return language != 'sn'
    return False

def get_sentiment(text: str) -> str:
    """Classify the sentiment of English text with TextBlob.

    Args:
        text: Raw message. Non-string values yield "unknown".

    Returns:
        "positive" when polarity is above 0.1, "negative" when below -0.1,
        "neutral" in between, and "unknown" when the text is not detected as
        English or language detection / scoring fails.
    """
    if not isinstance(text, str):
        return "unknown"
    try:
        if detect(text) != "en":
            return "unknown"
        score = TextBlob(text).sentiment.polarity
    except Exception:
        # Best effort: any detection or scoring failure means "unknown".
        # Catching Exception rather than using a bare except lets
        # KeyboardInterrupt and SystemExit still stop a long-running apply().
        return "unknown"
    if score > 0.1:
        return "positive"
    if score < -0.1:
        return "negative"
    return "neutral"

def get_context(text: str, slang_keys: Set[str]) -> str:
    """Assign a coarse topic label to a message.

    Categories are tried in a fixed order and the first hit wins:
    religion, finance, greeting, movement; otherwise 'general'.
    Non-string input yields 'unknown'.

    NOTE(review): ``slang_keys`` is not used in the body; the greeting check
    reads the module-level ``greeting_keys`` instead - confirm the intent.
    NOTE(review): keywords are matched as substrings of the normalized text,
    so 'mwari' also matches inside 'shamwari' - confirm this is wanted.
    """
    if not isinstance(text, str):
        return 'unknown'

    cleaned = preprocess_text(text)

    def mentions(keywords):
        # Substring test against the normalized message.
        for keyword in keywords:
            if keyword in cleaned:
                return True
        return False

    if mentions(('mwari', 'pray', 'amen')):
        return 'religion'
    if mentions(('mari', 'dollar', 'float', 'bag', 'mula')):
        return 'finance'
    if is_greeting(cleaned, greeting_keys):
        return 'greeting'
    if mentions(('kuya', 'kugrafta', 'kubaya', 'chi round')):
        return 'movement'
    return 'general'

# Build the lookup key sets once. get_context also reads greeting_keys from
# module scope, so it must exist before the labeling below runs.
greeting_keys = set(greeting_dict.keys())
slang_keys = set(slang_dict.keys())

# Label every message; each assignment below adds one column to df.
# The 'message' column is not cast to str here; the labelers return their
# default value for non-string cells.
df['is_code_mixed'] = df['message'].apply(is_code_mixed)
df['sentiment'] = df['message'].apply(get_sentiment)
# greeting_keys is passed into get_context's second parameter (named slang_keys).
df['context'] = df['message'].apply(lambda x: get_context(x, greeting_keys))
df['greeting_info'] = df['message'].apply(lambda x: get_greeting_info(x, greeting_dict))
df['has_greeting'] = df['greeting_info'].apply(lambda x: len(x) > 0)
df['exact_slang'] = df['message'].apply(lambda x: extract_exact_slang(x, slang_dict))
df['has_exact_slang'] = df['exact_slang'].apply(lambda x: len(x) > 0)
df['fuzzy_slang'] = df['message'].apply(lambda x: highlight_fuzzy_slang(x, slang_dict))
df['has_fuzzy_slang'] = df['fuzzy_slang'].apply(lambda x: len(x) > 0)


# Disabled alternative that stores the same results under different column names:
#df['exact_slang_metadata'] = df['message'].apply(lambda x: extract_exact_slang(x, slang_dict))
#df['has_exact_slang'] = df['exact_slang_metadata'].apply(lambda x: len(x) > 0)

#df['fuzzy_slang_highlight'] = df['message'].apply(lambda x: highlight_fuzzy_slang(x, slang_dict))
#df['has_fuzzy_slang'] = df['fuzzy_slang_highlight'].apply(lambda x: len(x) > 0)

# Write the labeled frame to CSV without the index column.
# NOTE(review): the path is directly under /content rather than a mounted
# Drive folder - confirm the file actually persists across sessions.
output_path = '/content/labeled_shona_messages.csv'
df.to_csv(output_path, index=False)

