# Word2Vec-style word embeddings using Gensim (FastText)

In [77]:
import subprocess
import sys
import os

# --- AUTO-INSTALLER BLOCK ---
def maintain_dependencies(required_libraries=None):
    """Make sure the third-party packages used by this script are importable.

    Each library is imported by name; if the import fails it is installed
    with ``pip`` using the interpreter that is running this script.

    Args:
        required_libraries: Optional iterable of package names to check.
            Each name is used both as the import name and as the pip package
            name. Defaults to ``('numpy', 'scipy', 'gensim')``.

    Raises:
        subprocess.CalledProcessError: If ``pip install`` exits with a
            non-zero status.
    """
    import importlib

    if required_libraries is None:
        # pandas is no longer needed for .txt files
        required_libraries = ('numpy', 'scipy', 'gensim')

    for lib in required_libraries:
        try:
            importlib.import_module(lib)
        except ImportError:
            print(f"📦 Library '{lib}' not found. Installing now...")
            subprocess.check_call([sys.executable, "-m", "pip", "install", lib])
            # The package appeared on disk after the interpreter started, so
            # the import system's finder caches must be refreshed before a
            # later ``import`` in this same process can see it.
            importlib.invalidate_caches()

# Run the check now, before the numpy / scipy / gensim imports further down.
maintain_dependencies()
# ----------------------------

import numpy as np
from scipy.stats import spearmanr, pearsonr
from gensim.models import FastText

# --- 1. Metrics Helper Functions ---

def confusion_matrix_np(y_true, y_pred):
    """Count the four cells of a binary confusion matrix.

    Args:
        y_true: Sequence of reference labels; 1 is positive, 0 is negative.
        y_pred: Sequence of predicted labels, aligned with ``y_true``.

    Returns:
        The tuple ``(tn, fp, fn, tp)``. Entries whose label is neither 0 nor
        1 do not contribute to any cell.
    """
    actual = np.array(y_true)
    predicted = np.array(y_pred)

    actual_pos = actual == 1
    actual_neg = actual == 0
    predicted_pos = predicted == 1
    predicted_neg = predicted == 0

    true_pos = np.sum(actual_pos & predicted_pos)
    true_neg = np.sum(actual_neg & predicted_neg)
    false_pos = np.sum(actual_neg & predicted_pos)
    false_neg = np.sum(actual_pos & predicted_neg)
    return true_neg, false_pos, false_neg, true_pos

def accuracy_np(tp, tn, fp, fn):
    """Return the fraction of correct predictions, or 0.0 with no samples."""
    correct = tp + tn
    total = correct + fp + fn
    if total > 0:
        return correct / total
    return 0.0

def precision_np(tp, fp):
    """Return tp / (tp + fp), or 0.0 when nothing was predicted positive."""
    predicted_positive = tp + fp
    if predicted_positive > 0:
        return tp / predicted_positive
    return 0.0

def recall_np(tp, fn):
    """Return tp / (tp + fn), or 0.0 when there are no actual positives."""
    actual_positive = tp + fn
    if actual_positive > 0:
        return tp / actual_positive
    return 0.0

def f1_np(precision, recall):
    """Return the harmonic mean of precision and recall.

    Yields 0.0 when the two values do not sum to a positive number.
    """
    denominator = precision + recall
    if denominator > 0:
        return 2 * (precision * recall) / denominator
    return 0.0

# --- 2. IsiZulu Evaluator Class ---

class IsiZuluBenchmarkEvaluator:
    """Score an embedding model against human-rated isiZulu word pairs.

    For every benchmark pair the model's similarity is looked up, the
    similarities are correlated with the human ratings, and classification
    metrics are derived by splitting both score lists at their medians.
    """

    def __init__(self, model=None):
        """Store the model and start with an empty similarity cache.

        Args:
            model: Object exposing ``model.wv.similarity(word1, word2)``
                (this script passes a trained gensim ``FastText`` model).
                May be ``None``; every lookup then yields ``None``.
        """
        self.model = model
        # Maps an order-independent (word, word) tuple to its similarity.
        self.similarity_cache = {}

    def get_similarity(self, word1, word2):
        """Return the model similarity of two words, or ``None`` on failure.

        Both inputs are converted to ``str``, lower-cased and stripped before
        the lookup. Successful results are cached under a key that ignores
        argument order; failed lookups are not cached.

        Args:
            word1: First word (any object; ``str()`` is applied).
            word2: Second word (any object; ``str()`` is applied).

        Returns:
            Whatever ``model.wv.similarity`` returns, or ``None`` when no
            model is set or the lookup raises.
        """
        word1, word2 = str(word1).lower().strip(), str(word2).lower().strip()
        cache_key = tuple(sorted([word1, word2]))

        if cache_key in self.similarity_cache:
            return self.similarity_cache[cache_key]

        # Explicit guard instead of relying on a swallowed AttributeError.
        if self.model is None:
            return None

        try:
            # FastText generates vectors for words it hasn't seen before
            sim = self.model.wv.similarity(word1, word2)
        except Exception:
            # Deliberately best-effort: a pair the model cannot score is
            # skipped by evaluate() rather than aborting the whole benchmark.
            return None

        self.similarity_cache[cache_key] = sim
        return sim

    def evaluate(self, word_pairs):
        """Print benchmark metrics for ``word_pairs`` and return them.

        Args:
            word_pairs: Iterable of ``(word1, word2, human_score)`` triples.
                Pairs for which ``get_similarity`` returns ``None`` are
                skipped.

        Returns:
            ``None`` when fewer than two pairs could be scored (an error
            line is printed instead). Otherwise a dict with the keys
            ``spearman``, ``pearson``, ``precision``, ``recall``, ``f1``,
            ``accuracy`` (floats) and ``pairs_evaluated`` (int).
        """
        model_scores, human_scores = [], []

        for w1, w2, h_score in word_pairs:
            sim = self.get_similarity(w1, w2)
            if sim is not None:
                model_scores.append(sim)
                human_scores.append(h_score)

        # Correlations need at least two observations.
        if len(model_scores) < 2:
            print("❌ Error: Not enough valid word pairs found in the model.")
            return None

        # Correlation Metrics
        rho, _ = spearmanr(human_scores, model_scores)
        pear, _ = pearsonr(human_scores, model_scores)

        # Binarize for Classification (using median): a score at or above
        # its own list's median counts as the positive class.
        y_true = (np.array(human_scores) >= np.median(human_scores)).astype(int)
        y_pred = (np.array(model_scores) >= np.median(model_scores)).astype(int)

        tn, fp, fn, tp = confusion_matrix_np(y_true, y_pred)
        acc = accuracy_np(tp, tn, fp, fn)
        prec = precision_np(tp, fp)
        rec = recall_np(tp, fn)
        f1 = f1_np(prec, rec)

        print("="*45)
        print(f"Spearman Correlation: {rho:.4f}")
        print(f"Pearson Correlation:  {pear:.4f}")
        print("-" * 20)
        print(f"Precision:            {prec:.4f}")
        print(f"Recall:               {rec:.4f}")
        print(f"F1 Score:             {f1:.4f}")
        print(f"Accuracy:             {acc:.4f}")
        print("-" * 20)
        print(f"Pairs Evaluated:      {len(model_scores)}")
        print("="*45)

        # Hand the numbers back so callers can log or compare runs instead
        # of scraping the printed report.
        return {
            "spearman": float(rho),
            "pearson": float(pear),
            "precision": float(prec),
            "recall": float(rec),
            "f1": float(f1),
            "accuracy": float(acc),
            "pairs_evaluated": len(model_scores),
        }

# --- 3. Loading Functions ---

def load_text_file(filepath):
    """Read a UTF-8 text file and return a list of tokenized sentences.

    Every non-blank line becomes one sentence: the line is lower-cased and
    split on whitespace.

    Args:
        filepath: Path of the text file to read.

    Returns:
        A list of token lists, one per non-blank line. If the file cannot be
        opened, read or decoded, the error is printed and an empty list is
        returned (any lines read before the failure are discarded).
    """
    try:
        with open(filepath, 'r', encoding='utf-8') as f:
            # str.split() with no argument already ignores leading/trailing
            # whitespace and yields [] for blank lines.
            tokenized_lines = (line.lower().split() for line in f)
            return [tokens for tokens in tokenized_lines if tokens]
    except (OSError, UnicodeError) as e:
        # Only I/O and decoding problems are expected here; anything else is
        # a programming error and is allowed to surface.
        print(f"❌ Error reading {filepath}: {e}")
        return []

# --- 4. Main Execution ---

if __name__ == "__main__":
    # CONFIGURATION
    CORPUS_FILE = 'isizulu_corpus.txt'  # Your training text
    
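    # Internal benchmark for testing: (word1, word2, human_score) tuples.
    # Replace these with your own test pairs as needed. Note that some pairs
    # appear more than once (sometimes with different human scores), and
    # multi-word entries are passed to the model as one single string.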
    isi_test_pairs = [
        ("umfazi", "indoda", 6.0),
        ("ingane", "umntwana", 9.0),
        ("inja", "ikati", 4.5),
        ("isikole", "isikhungo", 8.0),
        ("ukudla", "ukuphuza", 5.5),
        ("ikhaya", "indlu", 9.2),
        ('usho', 'njalo', 6.7),
        ('uqhawekazi', 'mdikane', 9.2),
        ('umphiko', 'lwezemakumaketha', 3.4),
        ('umfazi','indoda',5.2),
        ('ingane','umntwana',9.5),
        ('indlu','ikhaya',8.7),
        ('umfula','ulwandle',6.5),
        ('isikole','isikhungo',7.8),
        ('imali','uhulumeni',4.2),
        ('uthisha','umfundi',7.5),
        ('isitsha','indishi',9.2),
        ('ibhola','umdlalo',7.8),
        ('ukudla','ukuphuza',6.5),
        ('usuku','ubusuku',3.2),
        ('umuntu','ubuntu',8.5),
        ('itheku','idolobha',9.1),

         ('inzondo', 'ucansi', 6.77),
        ('ihlosi', 'ikati', 7.35),
        ('ihlosi', 'ihlosi', 10.0),
        ('incwadi', 'iphepha', 7.46),
        ('ikhompiyutha', 'ikhibhodi', 7.62),
        ('ikhompiyutha', 'inthanethi', 7.58),
        ('indiza', 'imoto', 5.77),
        ('isitimela', 'imoto', 6.31),
        ('ucingo', 'ukuxhumana', 7.50),
        ('umabonakude', 'umsakazo', 6.77),
        ('abezindaba', 'umsakazo', 7.42),
        ('isidakamizwa', 'ukuhlukumeza', 6.85),
        ('isinkwa', 'ibhotela', 6.19),
        ('ikhukhamba', 'izambane', 5.92),
        ('udokotela', 'umhlengikazi', 7.00),
        ('solwazi', 'udokotela', 6.62),
        ('umfundi', 'solwazi', 6.81),
        ('hlakaniphile', 'umfundi', 4.62),
        ('hlakaniphile', 'isilima', 5.81),
        ('inkampani', 'amasheya', 7.08),
        ('isitoko', 'indali', 8.08),
        ('isitoko', 'ucingo', 1.62),
        ('isitoko', 'iqanda', 1.81),
        ('ukuzala', 'iqanda', 6.69),
        ('incwadi', 'umtapo wezincwadi', 7.46),
        ('ibhange', 'imali', 8.12),
    ('ukhuni', 'ihlathi', 7.73),
    ('imali', 'imali', 9.15),
    ('inkosi', 'indlovukazi', 8.58),
    ('inkosi', 'igwaba', 5.92),
    ('umbhishobhi', 'uRabi', 6.69),
    ('inyoni', 'iqhude', 7.10),
    ('inyoni', 'igwaba', 7.38),
    ('ithuluzi', 'qalisa', 6.46),
    ('umfana', 'mfowethu', 4.46),
    ('uhambo', 'imoto', 5.85),
    ('imali', 'idola', 8.42),
    ('imali', 'ingcebo', 8.27),
    ('imali', 'impahla', 7.57),
    ('imali', 'ibhange', 8.50),
    ('imali', 'ukufaka imali', 7.73),
    ('imali', 'ukuhoxa', 6.88),
    ('imali', 'ukuwasha imali', 5.65),
    ('ihlosi', 'isilwane', 7.00),
    ('ihlosi', 'izilwane', 5.62),
    ('ihlosi', 'i-zoo', 5.87),
    ('ukusebenza kwengqondo', 'ukwelashwa kwengqondo', 8.08),
    ('ukusebenza kwengqondo', 'ukukhathazeka', 7.00),
    ('ukusebenza kwengqondo', 'uvalo', 6.85),
    ('ukusebenza kwengqondo', 'ukudana', 7.42),
    ('ukusebenza kwengqondo', 'ingqondo', 7.69),
    ('unozungezilanga', 'inkanyezi', 8.45),
    ('unozungezilanga', 'inyanga', 8.08),
    ('unozungezilanga', 'ilanga', 8.02),
    ('unozungezilanga', 'Umthala', 8.11),
    ('inkomishi', 'ikhofi', 6.58),
    ('inkomishi', 'isiphuzo', 7.25),
    ('inkomishi', 'uketshezi', 5.90),
    ('amandla', 'Inkinga', 5.94),
    ('izindaba', 'umbiko', 8.16),
    ('impi', 'amabutho', 8.13),
    ('isikhumba', 'ihlo', 6.22),
    ('impilo', 'ukufa', 7.88),
    ('isikweletu', 'ikhadi', 8.06),
    ('ihhotela', 'isabelo', 8.03),
    ('ikhabethe', 'impahla', 8.00),
    ('umgwaqo', 'indlela', 8.88),
    ('iseli', 'ucingo', 7.81),
    ('izabelo', 'inkokhelo', 7.63),
    ('isibalo', 'ukubala', 8.44),
    ('imali', 'emakethe', 7.50),
    ('uwoyela', 'isitoko', 6.34),
    ('inzuzo', 'ukulahlekelwa', 7.63),
    ('idola', 'i-yen', 7.78),
    ('ikhompiyutha', 'i-software', 8.50),
    ('inethiwekhi', 'i-hardware', 8.31),

     ('ijubane', 'i-marathon', 7.47),
    ('umdlalo', 'game', 6.19),
    ('nqoba', 'defeat', 6.97),
    ('uchungechunge', 'series', 3.56),
    ('ulwandle', 'sea', 7.47),
    ('ukudla', 'food', 8.34),
    ('isikhuphashe', 'lobster', 8.7),
    ('ukudla', 'food', 7.81),
    ('iwayini', 'wine', 5.7),
    ('ukulungiselela', 'preparation', 6.22),


 ("Penny", "Ntuli", 9.0),
    ("umsakazi", "Penny", 8.0),
    ("umsakazi", "Jozi FM", 7.0),
    ("Jozi FM", "abalaleli", 7.0),
    ("Jozi FM", "abaphathi", 8.0),
    ("abalaleli", "izilimi", 6.5),
    ("izindondo", "Radio Presenter", 8.5),
    ("Mnuz Tshepo Makgopa", "Bhebhe", 7.5),
    ("Gagasi FM", "Jozi FM", 6.0),
    ("uhola", "imali", 7.5),
    ("uhola", "R2 800", 9.0),
    ("iminyaka", "izinyanga", 6.5),
    ("ulwazi", "izilimi", 7.0),
    ("ukukhuluma", "abalaleli", 8.0),
    ("ngiyabonga", "kozakwethu", 7.5),
    ("ukubekezelela", "abalaleli", 6.5),
    ("uhambo", "imsakazo", 7.0),
    ("ithuba", "amathuba", 8.0),
    ("iminyaka", "isikhathi", 6.0),
    ("imali", "uhola", 7.5),
    ("R2 800", "imali", 9.0),
    ("ngiphinde", "ngabuyela", 6.5),

     ("DJ Warras", "umangalelwa", 7.0),
    ("DJ Warras", "ubulawa", 9.0),
    ("umangalelwa", "icala", 8.0),
    ("icala", "inkantolo", 7.5),
    ("Armindo Pacula", "umangalelwa", 9.0),
    ("Armindo Pacula", "Mozambique", 6.5),
    ("Primrose", "Marathon", 6.0),
    ("uhlelo", "bheyili", 7.0),
    ("bheyili", "icala", 6.5),
    ("Victor Majola", "umangalelwa wokuqala", 8.5),
    ("Victor Majola", "isitokisini", 7.5),
    ("Febhuwari 9", "icala", 6.0),
    ("Febhuwari 11", "inkantolo", 6.5),
    ("amaphepha", "ikheli", 7.0),
    ("amaphepha", "ubufakazi", 6.5),
    ("ikheli", "umsolwa", 6.0),
    ("pasi", "amaphoyisa", 7.5),
    ("umphakathi", "inkantolo", 6.0),
    ("ukuhlolwa", "icala", 6.5),
    ("ukuboshwa", "umangalelwa", 8.0),
    ("ukwephula", "imithetho", 8.5),
    ("Ningizimu Afrika", "abantu baseMozambique", 6.5),

    ("iphoyisa", "ummangalelwa", 6.8),
("umphikisi", "ummangalelwa", 8.0),
("isibhamu", "umsebenzi", 5.2),
("ethaveni", "etafuleni", 7.5),
("umufi", "usihlalo", 4.0),
("umufi", "umngani", 6.0),
("ukudubula", "ukushaya", 7.2),
("ubufakazi", "ofakazi", 8.5),
("isidumbu", "udokotela", 6.3),
("umthetho", "iphoyisa", 7.8),
("iminyaka", "isigwebo", 8.7),
("imoto", "emoyeni", 5.5),
("ukuzinikela", "ukuboshwa", 7.0),
("indawo", "ethaveni", 7.6),
("isibhamu", "ububi", 6.0),

("ubufakazi", "ofakazi", 8.5),
("umthetho", "ezokuphepha", 7.0),
("ibhilidi", "ehhovisi", 6.5),
("imoto", "isikhiye", 5.5),
("unobhala", "ikhomishana", 6.0),
("umyalelo", "ukumkhipha", 7.2),
("ikhamera", "okuqapha", 8.0),
("ukusayina", "irejista", 8.0),
("imisebenzi", "amapheshana", 7.5),
("umphathi", "Lieutenant General", 6.8),
("ukushiya", "ukubuyisela", 6.0),
("isimo", "okungajwayelekile", 7.3),

("ibhilidi", "ehhovisi", 6.5),
("umsolwa", "ufakazi", 7.5),
("inkosi", "uChonco", 6.0),
("icala", "ubufakazi", 8.0),
("ukubulawa", "okusoconga", 7.8),
("inkantolo", "umasipala", 6.5),
("umndeni", "amalungu", 7.0),
("umholi", "ummeli", 6.2),
("umthombo", "uphenyo", 7.3),
("ukuvuma", "ukufakaza", 8.1),
("ukudluliselwa", "ukususiwe", 7.0),
("ubufakazi", "izinto eziveziwe", 7.9),
("ukushaya ngolonwabu", "ukukhathaza", 6.8),
("ejeleni", "ukuboshwa", 8.0),
("amacala", "icala elibucayi", 7.6),
("ukuhlala ejele", "ukubukeka idangele", 6.5),
("okubulala", "icala lokubulala", 8.3),
("induna", "umuntu okumele akhombwe", 6.7),
("ukuthatha isinqumo", "ukuzophawula", 6.0),
("umthetho", "ukudluliselwa", 7.2),
("umkhankaso", "uhlelo", 6.8),
("ukuhlela", "ukuphatha", 7.0),
("umsolwa", "ukuboshwa", 7.5),
("ukuhlolwa", "uphenyo", 7.3),
("ukwahlulela", "inkantolo", 7.8),
("ubufakazi", "ukuvuma", 8.2),
("umcimbi", "icala", 6.5),
("ukusolwa", "ukuboshwa", 7.0),
("okungokomthetho", "icala", 7.4),

("imiphumela", "amaphuzu", 8.2),
("ukungeneme", "ukunganeliseki", 8.5),
("ukucela", "isicelo", 7.8),
("ukumakelwa kabusha", "ukuphinde umakelwe", 8.7),
("iphepha", "isivivinyo", 7.5),
("ukumakwa", "ukuhlolwa", 7.9),
("imiphumela ka-matric", "izimpendulo zokuhlolwa", 7.6),
("umnyango", "wezeMfundo eyisiSekelo", 8.0),
("ukubona iphepha", "ukuhlolwa kabusha", 7.4),
("izicelo", "ukubhalisa", 7.8),
("ukuvulwa", "ukuvalwa", 6.5),
("izinsuku eziyisikhombisa", "isikhathi sokuvala", 6.8),
("imiphumela yokumakelwa kabusha", "izimpendulo ezintsha", 7.7),
("isikhungo", "isikole", 7.5),
("ukuphucula izifundo", "ukwenza ngcono imiphumela", 8.4),
("izivivinyo zokuchibiyela", "izivivinyo zangoMeyi/Juni", 8.2),
("ukubhala kabusha", "ukuphinda ukubhala", 8.6),
("ubufakazi", "izizathu ezinqala", 7.3),
("ukugula", "izizathu zokungabhalanga", 7.0),
("School Based Assessment", "SBA", 9.0),
("u-matric omdala", "ababhala kusukela ngo-2008", 7.9),
("ukubhalisa", "ukufaka izicelo", 8.1),
("Life Orientation", "izifundo eziyisikhombisa", 6.9),
("Public Adult Education Centre", "isikhungo sabantu abadala", 8.0),
("amakolishi amakhono", "izikhungo zemfundo", 7.6),

("odabule o-A", "impumelelo ephezulu", 8.8),
("izifundo", "amabanga", 7.6),
("ukudela", "ukuzinikela", 8.5),
("ubuthongo", "ukulala", 9.0),
("umsebenzi", "imisebenzi yesikole", 7.8),
("ukulalela ebusuku", "ukulala sekuhlwile", 8.2),
("ukuya esikoleni", "imfundo", 7.5),
("ukufundela ubudokotela", "izifundo zezokwelapha", 8.9),
("Computer Science", "izifundo zobuchwepheshe", 7.9),
("izifundo zasebusuku", "izifundo zangempelasonto", 8.3),
("ukuzilungiselela", "ukuzimisela", 8.4),
("Aucturial Science", "izibalo ezisezingeni eliphezulu", 8.1),
("isicelo sokufunda", "ukufaka isicelo", 8.7),
("University of Cape Town", "inyuvesi", 7.4),
("abazali", "abafundi", 6.5),
("isikhala sokufunda", "ithuba lokufunda", 8.2),
("idumela lesikole", "udumo", 8.0),
("ukungena belakanyana", "ukuqoqana", 7.8),
("impumelelo", "imiphumela emihle", 8.6),
("ukuzimisela kwabafundi", "ukusebenza kanzima", 8.5),

("IFP", "iqembu", 8.8),
("isifundazwe", "KwaZulu-Natal", 9.5),
("KwaZulu", "ubukhosi", 8.6),
("iSilo", "uMisuzulu", 9.4),
("inkulumo", "isimemezelo", 8.1),
("ukungeseka", "ukweseka", 9.6),
("igama", "isifundazwe", 7.9),
("ubukhosi", "uZulu", 9.2),
("umlando", "amasiko", 8.7),
("amaKhosi", "ubukhosi", 9.0),
("iShayamthetho", "uhulumeni", 7.5),
("uMgungundlovu", "inhlokodolobha", 8.4),
("ukushintshwa", "uguquko", 8.9),
("izifundazwe", "imingcele", 7.8),
("iCodesa", "izingxoxo", 8.2),
("iLembe", "uZulu", 8.6),
("uNatal", "ikholoni", 7.0),
("amagama", "izindawo", 7.6),
("isigodlo", "iSilo", 8.8),
("ukuzithuka", "inhlamba", 8.3),

("izikole", "izikhungo zemfundo", 8.4),
("i-stationery", "izinsiza zokufunda", 9.0),
("imali", "ama-norms and standards", 8.6),
("uMnyango wezeMfundo", "uMgcinimafa", 7.2),
("inyunyana yothisha", "Sadtu", 9.2),
("ukuya enkantolo", "isinyathelo somthetho", 8.7),
("ukuphoqa", "umyalo wenkantolo", 8.3),
("othishanhloko", "abaphathi bezikole", 8.5),
("abazali", "umphakathi", 7.0),
("umnikelo wemali", "ukukhokha", 7.4),
("ukuphula umthetho", "icala", 7.6),
("ukungabi khona kwezinsiza", "inhlupheko", 7.8),
("ukubhikisha", "ukuya emgwaqweni", 8.9),
("isixazululo sesikhashana", "ukunqoba kwesikhashana", 7.1),
("umyalelo wenkantolo", "isinqumo", 8.5),
("izikhulu zoMnyango", "abaholi boMnyango", 8.0),
("ukugunyaza", "ukuvunywa", 8.6),
("ukuthengwa kwezincwadi", "ukuhlinzekwa", 7.9),
("ukubambezeleka", "ukuphuzisa", 8.2),
("inhlupheko eKZN", "inhllekelele", 8.8),
("izikole ezingakhokhisi", "izikole ezingenamali yesikole", 8.4),
("ukuvulwa kwezikole", "ukuqala konyaka", 7.3),
("ukungaqiniseki", "ukukhathazeka", 8.1),
("osonkontileka", "abahlinzeki", 8.5),
("izikweletu", "ukungakhokhelwa", 8.7),
("omashonisa", "ababolekisi", 8.0),
("ukuphatha izikole ngemali yethu", "ukuzidela", 7.9),
("ukungakhulumi iqiniso", "ukuqamba amanga", 8.3),
("izethembiso", "ukuqinisekisa", 7.5),

("ingozi", "isibhicongo", 8.8),
("umgwaqo", "u-R102", 6.9),
("itekisi", "iloli", 7.2),
("ukushayisana", "ingozi yomgwaqo", 8.9),
("imiphefumulo", "abashonile", 8.5),
("ingane", "umfundi", 7.6),
("abasindile", "abalimele", 7.4),
("abezimo eziphuthumayo", "abatakuli", 8.7),
("ukutakula", "ukusiza", 8.2),
("izibhedlela", "ukwelashwa", 7.8),
("umshayeli wetekisi", "umshayeli weloli", 6.8),
("izincwadi zokushayela", "ilayisense", 9.1),
("eziphelelwe yisikhathi", "ezingasebenzi", 8.6),
("ukulayisha ngokweqile", "ukweqa umthwalo", 8.4),
("amasondo", "isimo semoto", 7.5),
("umgwaqo ongaphephile", "ingozi", 7.9),
("umnikazi weloli", "umnikazi wemoto", 7.7),
("ukubhekana nomthetho", "ukujeziswa", 8.8),
("inzuzo", "ukuzenzela imali", 7.3),
("ukuphepha", "isimiso sokuphepha", 8.1),
("izithuthi", "izimoto", 8.6),
("ukuphuthunyiswa", "ukuthunyelwa ngokushesha", 8.0),
("ingozi enyantisa igazi", "isigameko esibi", 8.7),
("isiphihli sengozi", "imbangela yengozi", 8.3),
("izinsuku zokukhumbula", "ukukhumbuleka", 6.9),
("abafundi", "abantwana besikole", 8.4),
("ukuvakashela indawo", "ukuhlola indawo", 7.5),

("i-maths", "i-maths literacy", 7.2),
("izikole zamabanga aphezulu", "high schools", 8.4),
("abafundi", "izingane zesikole", 8.0),
("umfundi", "umfundi ophumelelayo", 7.5),
("imikhakha ehlukene", "izifundo ezahlukene", 7.8),
("u-matric", "ukuhlolwa kokugcina", 8.1),
("ikhwelo", "ukuncishiswa kwamathuba", 7.6),
("i-KZN", "KwaZulu-Natal", 9.0),
("Eastern Cape", "Limpopo", 8.2),
("Western Cape", "izifundazwe", 7.5),
("abazali", "umphakathi", 7.0),
("abafundi basemakhaya", "abafundi besifunda", 8.3),
("abafudukela kwezinye izindawo", "ukuhamba kwabafundi", 7.9),
("o-thisha abenele", "teachers available", 8.5),
("izinto zokuthutha", "ukuthutha kwabafundi", 7.8),
("ukuhlanganisa izikole", "school consolidation", 7.7),
("isibalo sabafundi", "number of students", 8.0),
("emfundweni yasemazingeni aphansi", "primary education", 7.6),
("ukukhetha izifundo", "subject selection", 8.2),
("imfundo ephakeme", "higher education", 8.4),
("abashiywe ngaphandle", "excluded students", 7.8),
("ukugqugquzelwa", "ukukhuthazwa", 7.5),
("ukwenza lesi sifundo", "ukufunda i-maths", 8.1),
("okumele iphele le", "must end this", 7.3),
("izikole ezingayifundisi nhlobo i-maths", "schools without maths", 8.9),
("ukukhathazeka", "concern", 8.0),

("ikhansela", "uNkosi", 8.5),
("umhlangano womkhandlu", "isiphakamiso", 7.4),
("abaphazamisi", "abaqaphi", 8.0),
("umenenja yomkhandlu", "uMpumelelo Mnguni", 7.8),
("ukukhishwa emhlanganweni", "ukususwa", 8.1),
("ukunyuswa kwemiholo", "ukuphakanyiswa", 7.6),
("imali yokuhambisa izidingo", "ukuxhasa izidingo", 7.9),
("ukuxakazisana", "ukugaxekwa", 8.2),
("ukubathuka", "ukuhlukumeza", 7.7),
("ukushaywa", "ukudlwengulwa", 8.3),
("ukuvula icala", "ukubika icala", 7.8),
("umqaphi", "umqaphi wendawo", 7.6),
("ukuphindisela", "ukuziphindisela", 8.0),
("ukuthinteka kabi", "ukulimala ngokomzwelo", 7.9),
("ubufakazi", "umbiko", 8.1),
("iPhalamende", "ikomiti leMfundo", 7.5),
("ukuphuthuma", "ukuphuthumisa", 7.4),
("ukuxhumana", "ukuxhumana noNkosi", 7.6),
("ukushicilela", "intatheli", 7.8),
("ukumangalelwa", "amacala avuliwe", 8.2),
("ukuthumela imibuzo", "ukubuza imibuzo", 7.7),
("ukwethembisa", "ukuzibophezela", 8.0),
("indawo eMgungundlovu", "indawo yomhlangano", 7.5),
("intatheli", "abezindaba", 8.1),
("ukuphazamisa", "ukuphikisa", 7.9),

("UBONGE", "ukubonga", 8.9),
("izaguga", "ugogo nomkhulu", 8.5),
("umfundi", "USinokholo Mhlungu", 8.2),
("eVryheid", "eSolomuzi High", 7.4),
("imali engango-R5 000", "uxhaso lwezeMfundo", 8.0),
("izifundo zabo", "ukufunda ezikhungweni", 8.1),
("izikhungo eziphambili zemfundo", "izikole ezisezingeni", 8.3),
("ukuzimisela", "ukusebenza kanzima", 8.6),
("abahamba phambili", "abaphambili ezifundweni", 8.4),
("ukusebenza kanzima", "ukuzinikela", 8.5),
("ukwazi ukuzimisela", "ukuzikhandla", 8.2),
("ukusebenza ezifundweni", "impumelelo", 8.0),
("izikole ezakhele iZululand", "izikole eziphambili", 7.9),
("ugogo nomkhulu", "izaguga ezamkhulisa", 8.6),

("ukuzimisela", "ukwenza kahle", 8.6),
("izifundo", "izifundo zami", 8.2),
("Grade 10", "imfundo ephakeme", 7.5),
("ukwenza kahle", "impumelelo", 8.4),
("imali yoxhaso", "uxhaso lwezeMfundo", 8.3),
("ukubhalisa", "ukubhalisela izikole", 7.8),
("Masipala", "uMasipala", 9.0),
("o-thisha", "abafundisi", 7.6),
("iMaths", "iPhysics", 7.9),
("amathuba omsebenzi", "amathuba emisebenzi", 8.1),
("ukuphasa kahle", "impumelelo yezifundo", 8.5),
("abafundi", "izingane zesikole", 8.0),
("ukuzisa izidingo emphakathini", "ukuletha usizo", 7.7),
("ukuthola uxhaso", "ukuxhaswa", 8.2),
("ukubonga abafundi", "ukudumisa abafundi", 8.4),
("ukungenelela kwabafundi", "ukuzimisela kwabafundi", 8.0),
("umsebenzi wabafundi", "ukusebenza kahle", 7.9),
("imali ayichazi", "uxhaso olungachazwanga", 7.5),
("impumelelo", "izithelo ezinhle", 8.6),
("ukuphumelela kwezikole", "izikole eziphase kahle", 8.3)

    ]

    # Train on the corpus if it exists; otherwise write a tiny sample corpus
    # so the next run has something to load.
    if os.path.exists(CORPUS_FILE):
        print(f"📂 Loading text corpus from {CORPUS_FILE}...")
        sentences = load_text_file(CORPUS_FILE)

        if sentences:
            print(f"🚀 Training FastText model on {len(sentences)} sentences...")
            # We train for 150 epochs to ensure learning on smaller txt files.
            # min_count=1 keeps every word; min_n/max_n bound the character
            # n-gram lengths FastText uses (see the gensim FastText docs).
            model = FastText(
                sentences=sentences,
                vector_size=50,
                window=3,
                min_count=1,
                epochs=150,
                min_n=3,
                max_n=6,
            )

            evaluator = IsiZuluBenchmarkEvaluator(model=model)
            evaluator.evaluate(isi_test_pairs)
        else:
            # Also reached when load_text_file() failed and returned [];
            # in that case it has already printed the underlying error.
            print("🛑 The text file is empty.")
    else:
        # Create a dummy file if it doesn't exist just so you can see it work
        print(f"🛑 File '{CORPUS_FILE}' not found.")
        print("Creating a sample 'isizulu_corpus.txt' for you...")
        with open(CORPUS_FILE, 'w', encoding='utf-8') as f:
            f.write("umfazi nendoda bahamba esikoleni.\ningane idla ukudla kwayo.\n")
        print("Run the script again now that the file exists!")

📂 Loading text corpus from isizulu_corpus.txt...
🚀 Training FastText model on 755 sentences...
Spearman Correlation: 0.1468
Pearson Correlation:  0.1360
--------------------
Precision:            0.5700
Recall:               0.5488
F1 Score:             0.5592
Accuracy:             0.5507
--------------------
Pairs Evaluated:      414
