# FastText Word Embeddings using Gensim

In [2]:
import subprocess
import sys
import os
import csv

# --- AUTO-INSTALLER BLOCK ---
def maintain_dependencies():
    """Ensure the third-party packages this script needs are importable.

    Tries to import each of numpy, scipy and gensim. Any package whose
    import raises ImportError is installed with pip, run through the
    current interpreter (``sys.executable -m pip install <name>``).

    Raises:
        subprocess.CalledProcessError: If a ``pip install`` command exits
            with a non-zero status.
    """
    import importlib  # local import: only this helper needs it

    required_libraries = ['numpy', 'scipy', 'gensim']
    for lib in required_libraries:
        try:
            importlib.import_module(lib)
        except ImportError:
            print(f"📦 Library '{lib}' not found. Installing now...")
            subprocess.check_call([sys.executable, "-m", "pip", "install", lib])
            # The import system caches directory listings; refresh them so
            # the package installed a moment ago can be imported by the
            # module-level imports that follow in this same process.
            importlib.invalidate_caches()

maintain_dependencies()
# ----------------------------

import numpy as np
from scipy.stats import spearmanr, pearsonr
from gensim.models import FastText

# --- COSINE SIMILARITY ---
def cosine_similarity(vec1, vec2):
    """Return the cosine of the angle between two vectors.

    Computed as (A . B) / (||A|| * ||B||). When either vector has zero
    length the similarity is undefined, so 0.0 is returned instead.
    """
    numerator = np.dot(vec1, vec2)
    length_a, length_b = np.linalg.norm(vec1), np.linalg.norm(vec2)

    # Guard against division by zero for all-zero vectors.
    if 0 in (length_a, length_b):
        return 0.0

    denominator = length_a * length_b
    return numerator / denominator

# --- CLASSIFICATION METRICS ---
def confusion_matrix_np(y_true, y_pred):
    """Count the four binary confusion-matrix cells.

    Labels equal to 1 are treated as positive and labels equal to 0 as
    negative. The counts are returned in the order (tn, fp, fn, tp).
    """
    truth = np.array(y_true)
    guess = np.array(y_pred)

    # Boolean masks for each class on both sides of the comparison.
    is_pos, is_neg = truth == 1, truth == 0
    said_pos, said_neg = guess == 1, guess == 0

    true_pos = np.sum(is_pos & said_pos)
    true_neg = np.sum(is_neg & said_neg)
    false_pos = np.sum(is_neg & said_pos)
    false_neg = np.sum(is_pos & said_neg)
    return true_neg, false_pos, false_neg, true_pos

def accuracy_np(tp, tn, fp, fn):
    """Return the fraction of correct predictions, (TP + TN) / all cases.

    Yields 0.0 when the four counts do not sum to a positive total.
    """
    correct = tp + tn
    total = correct + fp + fn
    if total > 0:
        return correct / total
    return 0.0

def precision_np(tp, fp):
    """Return TP / (TP + FP), or 0.0 when nothing was predicted positive."""
    predicted_positive = tp + fp
    if predicted_positive > 0:
        return tp / predicted_positive
    return 0.0

def recall_np(tp, fn):
    """Return TP / (TP + FN), or 0.0 when there are no actual positives."""
    actual_positive = tp + fn
    if actual_positive > 0:
        return tp / actual_positive
    return 0.0

def f1_np(precision, recall):
    """Return the harmonic mean of precision and recall.

    F1 = 2 * (precision * recall) / (precision + recall); 0.0 is returned
    when precision + recall is not positive.
    """
    combined = precision + recall
    if combined > 0:
        return 2 * (precision * recall) / combined
    return 0.0

# --- FILE LOADING ---
def load_text_file(filepath):
    """Read a UTF-8 text file into a list of tokenized sentences.

    Each line is lower-cased and split on whitespace; lines that yield no
    tokens (blank or whitespace-only) are skipped.

    Args:
        filepath: Path of the text file to read.

    Returns:
        A list of token lists, one per non-empty line. If the file cannot
        be opened or decoded, an error message is printed and an empty list
        is returned (any lines read before the failure are discarded).
    """
    try:
        with open(filepath, 'r', encoding='utf-8') as f:
            # str.split() with no separator already ignores leading and
            # trailing whitespace, so no separate strip() is needed.
            tokenized = (line.lower().split() for line in f)
            return [tokens for tokens in tokenized if tokens]
    except (OSError, UnicodeError) as e:
        # Only I/O and decoding failures are treated as "no data"; other
        # exceptions indicate a programming error and are left to propagate.
        print(f"❌ Error reading {filepath}: {e}")
        return []

# --- MAIN EXECUTION ---
if __name__ == "__main__":
    # Evaluation driver. Steps, in order:
    #   1. load the corpus (or create a tiny sample file and exit),
    #   2. train a gensim FastText model on it,
    #   3. score every test pair with cosine similarity,
    #   4. compare model scores with the human scores (correlation plus
    #      median-threshold classification metrics),
    #   5. write the per-pair results to CSV and a metrics summary to text.
    CORPUS_FILE = 'isizulu_corpus.txt'
    OUTPUT_CSV = 'isizulu_results_with_metrics.csv'

    # FastText hyperparameters, kept in one place so the banner printed
    # before training cannot drift from the values passed to the model.
    FASTTEXT_PARAMS = {
        'vector_size': 50,
        'window': 3,
        'min_count': 1,
        'epochs': 150,
        'min_n': 3,
        'max_n': 6,
    }

    # Test pairs: (word1, word2, human similarity score).
    # The commented-out tuples are earlier pairs that are currently disabled;
    # only the uncommented tuples further down are evaluated.
    isi_test_pairs = [
        # ("umfazi", "indoda", 6.0),
        # ("ingane", "umntwana", 9.0),
        # ("inja", "ikati", 4.5),
        # ("isikole", "isikhungo", 8.0),
        # ("ukudla", "ukuphuza", 5.5),
        # ("ikhaya", "indlu", 9.2),
        # ('usho', 'njalo', 6.7),
        # ('uqhawekazi', 'mdikane', 9.2),
        # ('umphiko', 'lwezemakumaketha', 3.4),
        # ('umfazi','indoda',5.2),
        # ('ingane','umntwana',9.5),
        # ('indlu','ikhaya',8.7),
        # ('umfula','ulwandle',6.5),
        # ('isikole','isikhungo',7.8),
        # ('imali','uhulumeni',4.2),
        # ('uthisha','umfundi',7.5),
        # ('isitsha','indishi',9.2),
        # ('ibhola','umdlalo',7.8),
        # ('ukudla','ukuphuza',6.5),
        # ('usuku','ubusuku',3.2),
        # ('umuntu','ubuntu',8.5),
        # ('itheku','idolobha',9.1),
        # ('inzondo', 'ucansi', 6.77),
        # ('ihlosi', 'ikati', 7.35),
        # ('ihlosi', 'ihlosi', 10.0),
        # ('incwadi', 'iphepha', 7.46),
        # ('ikhompiyutha', 'ikhibhodi', 7.62),
        # ('ikhompiyutha', 'inthanethi', 7.58),
        # ('indiza', 'imoto', 5.77),
        # ('isitimela', 'imoto', 6.31),
        # ('ucingo', 'ukuxhumana', 7.50),
        # ('umabonakude', 'umsakazo', 6.77),
        # ('abezindaba', 'umsakazo', 7.42),
        # ('isidakamizwa', 'ukuhlukumeza', 6.85),
        # ('isinkwa', 'ibhotela', 6.19),
        # ('ikhukhamba', 'izambane', 5.92),
        # ('udokotela', 'umhlengikazi', 7.00),
        # ('solwazi', 'udokotela', 6.62),
        # ('umfundi', 'solwazi', 6.81),
        # ('hlakaniphile', 'umfundi', 4.62),
        # ('hlakaniphile', 'isilima', 5.81),
        # ('inkampani', 'amasheya', 7.08),
        # ('isitoko', 'indali', 8.08),
        # ('isitoko', 'ucingo', 1.62),
        # ('isitoko', 'iqanda', 1.81),
        # ('ukuzala', 'iqanda', 6.69),
        # ('incwadi', 'umtapo wezincwadi', 7.46),
        # ('ibhange', 'imali', 8.12),
        # ('ukhuni', 'ihlathi', 7.73),
        # ('imali', 'imali', 9.15),
        # ('inkosi', 'indlovukazi', 8.58),
        # ('inkosi', 'igwaba', 5.92),
        # ('umbhishobhi', 'uRabi', 6.69),
        # ('inyoni', 'iqhude', 7.10),
        # ('inyoni', 'igwaba', 7.38),
        # ('ithuluzi', 'qalisa', 6.46),
        # ('umfana', 'mfowethu', 4.46),
        # ('uhambo', 'imoto', 5.85),
        # ('imali', 'idola', 8.42),
        # ('imali', 'ingcebo', 8.27),
        # ('imali', 'impahla', 7.57),
        # ('imali', 'ibhange', 8.50),
        # ('imali', 'ukufaka imali', 7.73),
        # ('imali', 'ukuhoxa', 6.88),
        # ('imali', 'ukuwasha imali', 5.65),
        # ('ihlosi', 'isilwane', 7.00),
        # ('ihlosi', 'izilwane', 5.62),
        # ('ihlosi', 'i-zoo', 5.87),

        ('inkosi', 'imeya', 8.45),
        ('imali', 'isikweletu', 7.12),
        ('uhulumeni', 'umasipala', 8.90),
        ('inkohlakalo', 'icala', 7.50),

        # Education and matric (Imfundo no-Matric)
        ('isikole', 'inyuvesi', 8.20),
        ('umfundi', 'uthisha', 7.65),
        ('izifundo', 'imiphumela', 6.80),
        ('u-matric', 'isivivinyo', 9.10),

        # Music and art (Umculo Nobuciko)
        ('ingoma', 'icwecwe', 8.55),
        ('umrepha', 'umculi', 9.25),
        ('ikhwaya', 'umbhalo', 4.10),
        ('idume', 'izindondo', 6.40),

        # Police and crime (Amaphoyisa Nobugebengu)
        ('amaphoyisa', 'abasolwa', 8.15),
        ('isibhamu', 'inhlamvu', 9.40),
        ('ubunhloli', 'umkhondo', 8.70),
        ('isiteshi', 'inkantolo', 6.95),

        # Places and tourism (Indawo Nezokuvakasha)
        ('idolobha', 'ilokishi', 7.30),
        ('isivakashi', 'ihhotela', 8.85),
        ('ingqalasizinda', 'ukuthuthukiswa', 7.75),
        ('emakhaya', 'iphesheya', 3.20),

        # Unrelated words used as negative controls (Amagama Angahlobene)
        ('itekisi', 'ubudokotela', 1.15),
        ('umculo', 'isifo', 0.90),
        ('u-matric', 'ubumnandi', 2.50),
        ('inkosi', 'igwaba', 1.05),
    ]

    # Without a corpus there is nothing to train on: write a two-sentence
    # sample file so the user has a template, then stop.
    if not os.path.exists(CORPUS_FILE):
        print(f"🛑 File '{CORPUS_FILE}' not found.")
        print("Creating a sample file...")
        with open(CORPUS_FILE, 'w', encoding='utf-8') as f:
            f.write("umfazi nendoda bahamba esikoleni.\ningane idla ukudla kwayo.\n")
        print("✅ Sample file created! Run the script again.")
        sys.exit(0)

    print(f"📂 Loading corpus from {CORPUS_FILE}...")
    sentences = load_text_file(CORPUS_FILE)

    # load_text_file returns [] both for an empty file and for a read error.
    if not sentences:
        print("🛑 The text file is empty.")
        sys.exit(1)

    print(f"🚀 Training FastText model on {len(sentences)} sentences...")
    # min_count is intentionally not shown, matching the original banner.
    banner_keys = ('vector_size', 'window', 'epochs', 'min_n', 'max_n')
    banner = ", ".join(f"{key}={FASTTEXT_PARAMS[key]}" for key in banner_keys)
    print(f"   Parameters: {banner}\n")

    model = FastText(sentences=sentences, **FASTTEXT_PARAMS)

    print(f"✅ Model trained! Vocabulary size: {len(model.wv)}\n")

    # Calculate cosine similarities
    print("="*90)
    print("CALCULATING COSINE SIMILARITIES")
    print("="*90)

    cosine_scores = []  # model similarity for each successfully scored pair
    human_scores = []   # matching human score, index-aligned with cosine_scores
    results = []        # one dict per scored pair, later written to the CSV

    print(f"{'Word 1':<25} {'Word 2':<25} {'Human':<10} {'Cosine':<10}")
    print("-"*90)

    for w1, w2, h_score in isi_test_pairs:
        # Only the vector lookups can raise KeyError, so keep the try narrow.
        # NOTE(review): FastText can usually build a vector for unseen words
        # from character n-grams, so this branch may rarely fire; confirm
        # against the installed gensim version.
        try:
            vec1 = model.wv[w1]
            vec2 = model.wv[w2]
        except KeyError:
            print(f"{w1:<25} {w2:<25} {h_score:<10.2f} {'N/A':<10}")
            continue

        cos_sim = cosine_similarity(vec1, vec2)
        print(f"{w1:<25} {w2:<25} {h_score:<10.2f} {cos_sim:<10.6f}")

        cosine_scores.append(cos_sim)
        human_scores.append(h_score)
        results.append({
            'word1': w1,
            'word2': w2,
            'human_score': h_score,
            'cosine_similarity': cos_sim
        })

    print("="*90)

    # The correlation calls below are only run on two or more scored pairs.
    if len(cosine_scores) < 2:
        print("❌ Not enough valid pairs to calculate metrics.")
        sys.exit(1)

    # --- CALCULATE ALL METRICS ---
    print("\n" + "="*90)
    print("EVALUATION METRICS")
    print("="*90)

    # 1. Correlation Metrics
    rho, rho_p = spearmanr(human_scores, cosine_scores)
    pear, pear_p = pearsonr(human_scores, cosine_scores)

    print("\n📊 CORRELATION METRICS:")
    print(f"   Spearman Correlation:     {rho:.6f} (p-value: {rho_p:.6f})")
    print(f"   Pearson Correlation:      {pear:.6f} (p-value: {pear_p:.6f})")

    # 2. Classification Metrics (binarize using median)
    # Each score list is split at its own median: a pair is labelled 1 when
    # its score is at or above the median of that list, otherwise 0.
    human_median = np.median(human_scores)
    cosine_median = np.median(cosine_scores)

    y_true = (np.array(human_scores) >= human_median).astype(int)
    y_pred = (np.array(cosine_scores) >= cosine_median).astype(int)

    tn, fp, fn, tp = confusion_matrix_np(y_true, y_pred)
    accuracy = accuracy_np(tp, tn, fp, fn)
    precision = precision_np(tp, fp)
    recall = recall_np(tp, fn)
    f1 = f1_np(precision, recall)

    print("\n📈 CLASSIFICATION METRICS (Median Threshold):")
    print(f"   Accuracy:                 {accuracy:.6f}")
    print(f"   Precision:                {precision:.6f}")
    print(f"   Recall:                   {recall:.6f}")
    print(f"   F1 Score:                 {f1:.6f}")

    print("\n🔍 CONFUSION MATRIX:")
    print(f"   True Positives (TP):      {tp}")
    print(f"   True Negatives (TN):      {tn}")
    print(f"   False Positives (FP):     {fp}")
    print(f"   False Negatives (FN):     {fn}")

    # Coverage = scored pairs / listed pairs; computed once and reused for
    # both the console report and the metrics file.
    pairs_evaluated = len(cosine_scores)
    pairs_total = len(isi_test_pairs)
    coverage_pct = pairs_evaluated / pairs_total * 100

    print("\n📝 COVERAGE:")
    print(f"   Pairs Evaluated:          {pairs_evaluated}/{pairs_total}")
    print(f"   Coverage Rate:            {coverage_pct:.2f}%")

    print("\n📊 SCORE STATISTICS:")
    print("   Cosine Similarity:")
    print(f"      Min:     {min(cosine_scores):.6f}")
    print(f"      Max:     {max(cosine_scores):.6f}")
    print(f"      Mean:    {np.mean(cosine_scores):.6f}")
    print(f"      Median:  {cosine_median:.6f}")
    print(f"      Std Dev: {np.std(cosine_scores):.6f}")
    print("   Human Scores:")
    print(f"      Min:     {min(human_scores):.2f}")
    print(f"      Max:     {max(human_scores):.2f}")
    print(f"      Mean:    {np.mean(human_scores):.2f}")
    print(f"      Median:  {human_median:.2f}")
    print(f"      Std Dev: {np.std(human_scores):.2f}")

    print("="*90)

    # Save to CSV (newline='' lets the csv module control line endings).
    with open(OUTPUT_CSV, 'w', newline='', encoding='utf-8') as csvfile:
        fieldnames = ['word1', 'word2', 'human_score', 'cosine_similarity']
        writer = csv.DictWriter(csvfile, fieldnames=fieldnames)

        writer.writeheader()
        writer.writerows(results)

    print(f"\n✅ Results saved to '{OUTPUT_CSV}'")

    # Save metrics summary
    metrics_file = 'evaluation_metrics.txt'
    with open(metrics_file, 'w', encoding='utf-8') as f:
        f.write("ISIZULU WORD SIMILARITY EVALUATION METRICS\n")
        f.write("="*60 + "\n\n")
        f.write("CORRELATION METRICS:\n")
        f.write(f"  Spearman Correlation:  {rho:.6f} (p={rho_p:.6f})\n")
        f.write(f"  Pearson Correlation:   {pear:.6f} (p={pear_p:.6f})\n\n")
        f.write("CLASSIFICATION METRICS:\n")
        f.write(f"  Accuracy:              {accuracy:.6f}\n")
        f.write(f"  Precision:             {precision:.6f}\n")
        f.write(f"  Recall:                {recall:.6f}\n")
        f.write(f"  F1 Score:              {f1:.6f}\n\n")
        f.write("CONFUSION MATRIX:\n")
        f.write(f"  TP: {tp}  FP: {fp}\n")
        f.write(f"  FN: {fn}  TN: {tn}\n\n")
        f.write("COVERAGE:\n")
        f.write(f"  Pairs Evaluated: {pairs_evaluated}/{pairs_total}\n")
        f.write(f"  Coverage Rate:   {coverage_pct:.2f}%\n")

    print(f"✅ Metrics summary saved to '{metrics_file}'")
    print("\n🎉 Evaluation complete!")

📂 Loading corpus from isizulu_corpus.txt...
🚀 Training FastText model on 1016 sentences...
   Parameters: vector_size=50, window=3, epochs=150, min_n=3, max_n=6

✅ Model trained! Vocabulary size: 11856

CALCULATING COSINE SIMILARITIES
Word 1                    Word 2                    Human      Cosine    
------------------------------------------------------------------------------------------
inkosi                    imeya                     8.45       0.592014  
imali                     isikweletu                7.12       0.008890  
uhulumeni                 umasipala                 8.90       0.150290  
inkohlakalo               icala                     7.50       0.460789  
isikole                   inyuvesi                  8.20       0.516655  
umfundi                   uthisha                   7.65       0.005284  
izifundo                  imiphumela                6.80       0.203102  
u-matric                  isivivinyo                9.10       0.475835  
