# Word2Vec-style embeddings with FastText (subword n-grams) using Gensim

In [None]:
import subprocess
import sys
import os
import csv

# --- AUTO-INSTALLER BLOCK ---
def maintain_dependencies():
    """Make sure numpy, scipy and gensim can be imported, installing any that cannot.

    Each package is imported by name; when that raises ``ImportError`` the
    package is installed with ``pip`` through the interpreter that is running
    this code (``sys.executable -m pip install <name>``).

    Raises:
        subprocess.CalledProcessError: if the pip command exits with a
            non-zero status.
    """
    for package in ('numpy', 'scipy', 'gensim'):
        try:
            __import__(package)
            continue
        except ImportError:
            print(f"üì¶ Library '{package}' not found. Installing now...")
        # Reached only when the import above failed.
        pip_command = [sys.executable, "-m", "pip", "install", package]
        subprocess.check_call(pip_command)

maintain_dependencies()
# ----------------------------

import numpy as np
from scipy.stats import spearmanr, pearsonr
from gensim.models import Word2Vec, FastText

# --- COSINE SIMILARITY ---
def cosine_similarity(vec1, vec2):
    """Return the cosine similarity (A · B) / (||A|| × ||B||) of two vectors.

    If either vector has zero length the similarity is undefined, and 0.0 is
    returned instead of dividing by zero.
    """
    numerator = np.dot(vec1, vec2)
    magnitudes = [np.linalg.norm(vector) for vector in (vec1, vec2)]

    # Guard against division by zero for all-zero vectors.
    if any(magnitude == 0 for magnitude in magnitudes):
        return 0.0

    first_norm, second_norm = magnitudes
    return numerator / (first_norm * second_norm)

# --- CLASSIFICATION METRICS ---
def confusion_matrix_np(y_true, y_pred):
    """Count the four confusion-matrix cells for binary (0/1) labels.

    Args:
        y_true: sequence of ground-truth labels.
        y_pred: sequence of predicted labels, aligned with ``y_true``.

    Returns:
        The counts in the order ``(tn, fp, fn, tp)``.
    """
    actual = np.array(y_true)
    predicted = np.array(y_pred)

    # Boolean masks for each class, built once and combined below.
    actual_pos, actual_neg = actual == 1, actual == 0
    predicted_pos, predicted_neg = predicted == 1, predicted == 0

    true_pos = np.sum(actual_pos & predicted_pos)
    true_neg = np.sum(actual_neg & predicted_neg)
    false_pos = np.sum(actual_neg & predicted_pos)
    false_neg = np.sum(actual_pos & predicted_neg)
    return true_neg, false_pos, false_neg, true_pos

def accuracy_np(tp, tn, fp, fn):
    """Return accuracy, (TP + TN) / (TP + TN + FP + FN), or 0.0 when there are no samples."""
    sample_count = tp + tn + fp + fn
    if sample_count > 0:
        return (tp + tn) / sample_count
    return 0.0

def precision_np(tp, fp):
    """Return precision, TP / (TP + FP), or 0.0 when nothing was predicted positive."""
    predicted_positive = tp + fp
    if predicted_positive > 0:
        return tp / predicted_positive
    return 0.0

def recall_np(tp, fn):
    """Return recall, TP / (TP + FN), or 0.0 when there are no actual positives."""
    actual_positive = tp + fn
    if actual_positive > 0:
        return tp / actual_positive
    return 0.0

def f1_np(precision, recall):
    """Return the F1 score, 2 * (P * R) / (P + R), or 0.0 when P + R is not positive."""
    combined = precision + recall
    if combined > 0:
        return 2 * (precision * recall) / combined
    return 0.0

# --- FILE LOADING ---
def load_text_file(filepath):
    """Read a UTF-8 text file into a list of tokenized sentences.

    Every line is lower-cased, stripped and split on whitespace; lines that
    yield no tokens are dropped.

    Returns:
        A list of token lists, one per non-empty line. If anything goes wrong
        while opening or reading the file, the error is printed and an empty
        list is returned.
    """
    try:
        with open(filepath, 'r', encoding='utf-8') as handle:
            tokenized_lines = (raw.lower().strip().split() for raw in handle)
            return [words for words in tokenized_lines if words]
    except Exception as e:
        print(f"‚ùå Error reading {filepath}: {e}")
        return []

# --- CREATE SAMPLE CORPUS ---
def create_sample_corpus(filepath):
    """Write the built-in sample isiZulu corpus to *filepath* for testing.

    The file is created (or overwritten) as UTF-8 with one whitespace-separated
    sentence per line, and a confirmation message is printed afterwards.
    """
    corpus_lines = (
        "umfazi nendoda bahamba esikoleni",
        "ingane idla ukudla kwayo",
        "inja ikati zidlala eyadini",
        "isikole isikhungo semfundo",
        "ikhaya indlu yomndeni",
        "umfula ulwandle amanzi",
        "uthisha umfundi bafunda",
        "isitsha indishi kudla",
        "ibhola umdlalo imidlalo",
        "umuntu ubuntu ubuntu",
        "itheku idolobha amadolobha",
        "incwadi iphepha ukubhala",
        "ikhompiyutha ikhibhodi theknoloji",
        "indiza imoto isitimela ukuhamba",
        "ucingo ukuxhumana uxhumano",
        "umabonakude umsakazo ezindaba",
        "abezindaba umsakazo ukubika",
        "udokotela umhlengikazi ukwelapha",
        "solwazi umfundi ukufunda",
        "inkampani amasheya ukuhweba",
        "isitoko indali ukuthenga",
        "ibhange imali ukonga",
        "ukhuni ihlathi amahlathi",
        "inkosi indlovukazi umbuso",
        "umbhishobhi uRabi unkulunkulu",
        "inyoni iqhude izilwane",
        "ithuluzi ukusebenza",
        "umfana mfowethu umndeni",
        "uhambo imoto ukuhamba",
        "imali idola ingcebo impahla",
        "imali ibhange ukufaka ukuhoxa ukuwasha",
        "ihlosi isilwane izilwane i-zoo",
        "usho njalo isikhathi",
        "uqhawekazi mdikane isibindi",
        "umphiko indiza",
        "usuku ubusuku isikhathi",
        "inzondo ucansi",
        "isinkwa ibhotela ukudla",
        "ikhukhamba izambane imifino",
        "hlakaniphile isilima ukuhlakanipha",
        "ukuzala iqanda",
        "umtapo wezincwadi incwadi",
        "igwaba inkosi",
        "qalisa ithuluzi ukusebenza",
        "ukuhlukumeza isidakamizwa",
        "imeya inkosi amadolobha",
        "isikweletu imali ukuboleka",
        "umasipala uhulumeni isikhungo",
        "inkohlakalo icala ububi",
        "inyuvesi isikole ukufunda",
        "isivivinyo u-matric ukuhlola",
        "ingoma umculo icwecwe",
        "umrepha umculi umculo",
        "ikhwaya umbhalo amaculo",
        "idume izindondo udumo",
        "amaphoyisa abasolwa icala",
        "isibhamu inhlamvu udubula",
        "ubunhloli umkhondo uphenyisiso",
        "isiteshi inkantolo amaphoyisa",
        "ilokishi idolobha indawo",
        "ihhotela isivakashi ukulala",
        "ingqalasizinda ukuthuthukiswa ukwakha",
        "emakhaya iphesheya ezweni",
        "itekisi imoto ukuhamba",
        "ubudokotela udokotela ukwelapha",
        "isifo impilo ukugula",
        "ubumnandi ukujabula injabulo",
    )
    # Every sentence, including the last one, is terminated by a newline.
    corpus_text = "".join(f"{sentence}\n" for sentence in corpus_lines)

    with open(filepath, 'w', encoding='utf-8') as out:
        out.write(corpus_text)
    print(f"‚úÖ Sample corpus created: {filepath}")

# --- MAIN EXECUTION ---
if __name__ == "__main__":
    # End-to-end evaluation script:
    #   1. create (if missing) and load the corpus,
    #   2. train a FastText model on it,
    #   3. score every benchmark word pair with cosine similarity,
    #   4. compare those scores with the human ratings,
    #   5. write the per-pair results (CSV) and a metrics summary (text file),
    #   6. show that vectors can be looked up for a few extra words.
    # Everything stays at script level (no main()) so the variables remain
    # visible after the cell has run.
    CORPUS_FILE = 'isizulu_corpus.txt'
    OUTPUT_CSV = 'isizulu_fasttext_ngram_results.csv'

    # Training hyperparameters are defined once so that the console banner,
    # the FastText constructor and the metrics report cannot drift apart.
    VECTOR_SIZE = 200
    WINDOW = 7
    MIN_COUNT = 2
    EPOCHS = 100
    MIN_N = 3  # shortest character n-gram
    MAX_N = 6  # longest character n-gram

    RULE = "=" * 90

    # Benchmark pairs: (word 1, word 2, human similarity rating).
    # NOTE(review): ratings look like a 0-10 scale - confirm against the source
    # of the benchmark.
    isi_test_pairs = [
        # Government and finance
        ('inkosi', 'imeya', 8.45),
        ('imali', 'isikweletu', 7.12),
        ('uhulumeni', 'umasipala', 8.90),
        ('inkohlakalo', 'icala', 7.50),

        # Education and Matric
        ('isikole', 'inyuvesi', 8.20),
        ('umfundi', 'uthisha', 7.65),
        ('izifundo', 'imiphumela', 6.80),
        ('u-matric', 'isivivinyo', 9.10),

        # Music and arts
        ('ingoma', 'icwecwe', 8.55),
        ('umrepha', 'umculi', 9.25),
        ('ikhwaya', 'umbhalo', 4.10),
        ('idume', 'izindondo', 6.40),

        # Police and crime
        ('amaphoyisa', 'abasolwa', 8.15),
        ('isibhamu', 'inhlamvu', 9.40),
        ('ubunhloli', 'umkhondo', 8.70),
        ('isiteshi', 'inkantolo', 6.95),

        # Places and tourism
        ('idolobha', 'ilokishi', 7.30),
        ('isivakashi', 'ihhotela', 8.85),
        ('ingqalasizinda', 'ukuthuthukiswa', 7.75),
        ('emakhaya', 'iphesheya', 3.20),

        # Unrelated words (negative controls)
        ('itekisi', 'ubudokotela', 1.15),
        ('umculo', 'isifo', 0.90),
        ('u-matric', 'ubumnandi', 2.50),
        ('inkosi', 'igwaba', 1.05),
    ]

    # Create sample corpus if it doesn't exist
    if not os.path.exists(CORPUS_FILE):
        print("üìù Creating sample corpus file...")
        create_sample_corpus(CORPUS_FILE)

    print(f"\nüìÇ Loading corpus from {CORPUS_FILE}...")
    sentences = load_text_file(CORPUS_FILE)

    # load_text_file() returns [] both for an empty file and for a read error.
    if not sentences:
        print("üõë The text file is empty.")
        sys.exit(1)

    print(f"‚úÖ Loaded {len(sentences)} sentences")

    # The "(trigrams)" / "(hexagrams)" labels describe the values 3 and 6;
    # update them by hand if MIN_N / MAX_N are changed.
    print("\nüöÄ Training FastText model with subword n-grams...")
    print("   Architecture: Skip-gram with character n-grams")
    print("   Parameters:")
    print(f"      - Vector size: {VECTOR_SIZE}")
    print(f"      - Window: {WINDOW}")
    print(f"      - Min n-gram: {MIN_N} (trigrams)")
    print(f"      - Max n-gram: {MAX_N} (hexagrams)")
    print(f"      - Epochs: {EPOCHS}")
    print(f"      - Min count: {MIN_COUNT}")
    print("   This enables handling of OOV words and morphological variations!\n")

    # Flush output to ensure it's displayed before the training call starts
    sys.stdout.flush()

    # Train FastText model with subword information
    model = FastText(
        sentences=sentences,
        vector_size=VECTOR_SIZE,
        window=WINDOW,
        min_count=MIN_COUNT,
        epochs=EPOCHS,
        sg=1,  # Skip-gram architecture
        workers=4,
        alpha=0.025,
        min_alpha=0.0001,
        negative=10,
        # NOTE(review): much higher than gensim's documented default (1e-3);
        # presumably chosen to limit down-sampling on this tiny corpus - confirm.
        sample=1e-1,
        # Subword n-gram parameters
        min_n=MIN_N,  # Minimum character n-gram length
        max_n=MAX_N,  # Maximum character n-gram length
        # gensim's documentation states that only the default of 1 (regular
        # unigram word handling) is supported for this parameter.
        word_ngrams=1,
        bucket=2000000,  # Hash bucket size for n-grams
    )

    print(f"‚úÖ FastText model trained! Vocabulary size: {len(model.wv)}")
    print(f"   Subword n-grams: {model.wv.min_n}-{model.wv.max_n} characters")
    print("   This model can now handle unseen words!\n")
    sys.stdout.flush()

    # Calculate cosine similarities
    print(RULE)
    print("CALCULATING COSINE SIMILARITIES (WITH SUBWORD N-GRAMS)")
    print(RULE)

    cosine_scores = []
    human_scores = []
    results = []

    # Status label keyed by how many of the two words are in the trained
    # vocabulary (the remaining ones get their vector from character n-grams).
    status_by_known_words = {2: "In Vocab", 1: "Partial N-gram", 0: "Full N-gram"}

    print(f"{'Word 1':<25} {'Word 2':<25} {'Human':<10} {'Cosine':<10} {'Status':<15}")
    print("-" * 90)

    for w1, w2, h_score in isi_test_pairs:
        # Only the vector lookup and the similarity computation are guarded;
        # a failing pair is reported as an ERROR row and left out of the metrics.
        try:
            cos_sim = cosine_similarity(model.wv[w1], model.wv[w2])
        except Exception as e:
            print(f"{w1:<25} {w2:<25} {h_score:<10.2f} {'ERROR':<10} {str(e):<15}")
            continue

        # key_to_index is consulted directly because the question here is
        # whether the word was kept in the training vocabulary, not whether a
        # vector can be produced for it.
        in_vocab_w1 = w1 in model.wv.key_to_index
        in_vocab_w2 = w2 in model.wv.key_to_index
        status = status_by_known_words[in_vocab_w1 + in_vocab_w2]

        print(f"{w1:<25} {w2:<25} {h_score:<10.2f} {cos_sim:<10.6f} {status:<15}")

        cosine_scores.append(cos_sim)
        human_scores.append(h_score)
        results.append({
            'word1': w1,
            'word2': w2,
            'human_score': h_score,
            'cosine_similarity': cos_sim,
            'in_vocab': status,
        })

    print(RULE)
    sys.stdout.flush()

    # The correlation statistics need at least two data points.
    if len(cosine_scores) < 2:
        print("‚ùå Not enough valid pairs to calculate metrics.")
        sys.exit(1)

    # --- CALCULATE ALL METRICS ---
    print("\n" + RULE)
    print("EVALUATION METRICS (FASTTEXT WITH SUBWORD N-GRAMS)")
    print(RULE)

    # 1. Correlation Metrics
    rho, rho_p = spearmanr(human_scores, cosine_scores)
    pear, pear_p = pearsonr(human_scores, cosine_scores)

    print("\nüìä CORRELATION METRICS:")
    print(f"   Spearman Correlation:     {rho:.6f} (p-value: {rho_p:.6f})")
    print(f"   Pearson Correlation:      {pear:.6f} (p-value: {pear_p:.6f})")

    # 2. Classification Metrics: each score list is binarized at its own
    # median, so "positive" means "at or above the median of that list".
    human_median = np.median(human_scores)
    cosine_median = np.median(cosine_scores)

    y_true = (np.array(human_scores) >= human_median).astype(int)
    y_pred = (np.array(cosine_scores) >= cosine_median).astype(int)

    tn, fp, fn, tp = confusion_matrix_np(y_true, y_pred)
    accuracy = accuracy_np(tp, tn, fp, fn)
    precision = precision_np(tp, fp)
    recall = recall_np(tp, fn)
    f1 = f1_np(precision, recall)

    print("\nüìà CLASSIFICATION METRICS (Median Threshold):")
    print(f"   Accuracy:                 {accuracy:.6f}")
    print(f"   Precision:                {precision:.6f}")
    print(f"   Recall:                   {recall:.6f}")
    print(f"   F1 Score:                 {f1:.6f}")

    print("\nüîç CONFUSION MATRIX:")
    print(f"   True Positives (TP):      {tp}")
    print(f"   True Negatives (TN):      {tn}")
    print(f"   False Positives (FP):     {fp}")
    print(f"   False Negatives (FN):     {fn}")

    # Share of benchmark pairs that produced a score; computed once and reused
    # for both the console output and the metrics file.
    coverage_rate = len(cosine_scores) / len(isi_test_pairs) * 100

    print("\nüìù COVERAGE:")
    print(f"   Pairs Evaluated:          {len(cosine_scores)}/{len(isi_test_pairs)}")
    print(f"   Coverage Rate:            {coverage_rate:.2f}%")

    print("\nüìä SCORE STATISTICS:")
    print("   Cosine Similarity:")
    print(f"      Min:     {min(cosine_scores):.6f}")
    print(f"      Max:     {max(cosine_scores):.6f}")
    print(f"      Mean:    {np.mean(cosine_scores):.6f}")
    print(f"      Median:  {cosine_median:.6f}")
    print(f"      Std Dev: {np.std(cosine_scores):.6f}")
    print("   Human Scores:")
    print(f"      Min:     {min(human_scores):.2f}")
    print(f"      Max:     {max(human_scores):.2f}")
    print(f"      Mean:    {np.mean(human_scores):.2f}")
    print(f"      Median:  {human_median:.2f}")
    print(f"      Std Dev: {np.std(human_scores):.2f}")

    print("\nüî§ SUBWORD N-GRAM BENEFITS:")
    print("   - Can handle out-of-vocabulary (OOV) words")
    print("   - Captures morphological similarities in isiZulu")
    print(f"   - Character n-grams: {model.wv.min_n} to {model.wv.max_n}")
    print("   - Example: 'inkosi' ‚Üí n-grams like 'ink', 'nko', 'kos', 'osi', 'inko', 'nkos', 'kosi'")

    print(RULE)
    sys.stdout.flush()

    # Save the per-pair results to CSV
    with open(OUTPUT_CSV, 'w', newline='', encoding='utf-8') as csvfile:
        fieldnames = ['word1', 'word2', 'human_score', 'cosine_similarity', 'in_vocab']
        writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
        writer.writeheader()
        writer.writerows(results)

    print(f"\n‚úÖ Results saved to '{OUTPUT_CSV}'")

    # Save metrics summary. The report is assembled as a list of lines (an
    # empty string is a blank separator line) and written in a single call.
    metrics_file = 'fasttext_ngram_evaluation_metrics.txt'
    report_lines = [
        "ISIZULU WORD SIMILARITY EVALUATION (FASTTEXT WITH SUBWORD N-GRAMS)",
        "=" * 70,
        "",
        "MODEL CONFIGURATION:",
        "  Architecture:          FastText (Skip-gram with subword n-grams)",
        f"  Character n-grams:     {model.wv.min_n}-{model.wv.max_n}",
        f"  Vector size:           {VECTOR_SIZE}",
        f"  Window:                {WINDOW}",
        f"  Epochs:                {EPOCHS}",
        "",
        "CORRELATION METRICS:",
        f"  Spearman Correlation:  {rho:.6f} (p={rho_p:.6f})",
        f"  Pearson Correlation:   {pear:.6f} (p={pear_p:.6f})",
        "",
        "CLASSIFICATION METRICS:",
        f"  Accuracy:              {accuracy:.6f}",
        f"  Precision:             {precision:.6f}",
        f"  Recall:                {recall:.6f}",
        f"  F1 Score:              {f1:.6f}",
        "",
        "CONFUSION MATRIX:",
        f"  TP: {tp}  FP: {fp}",
        f"  FN: {fn}  TN: {tn}",
        "",
        "COVERAGE:",
        f"  Pairs Evaluated: {len(cosine_scores)}/{len(isi_test_pairs)}",
        f"  Coverage Rate:   {coverage_rate:.2f}%",
        "",
        "SUBWORD N-GRAM ADVANTAGES:",
        "  - Handles out-of-vocabulary words",
        "  - Captures morphological patterns in agglutinative languages",
        "  - Better generalization for rare words",
        "  - Robust to typos and spelling variations",
    ]
    with open(metrics_file, 'w', encoding='utf-8') as f:
        f.write("\n".join(report_lines) + "\n")

    print(f"‚úÖ Metrics summary saved to '{metrics_file}'")

    # Demonstrate OOV capability
    print("\nüî¨ DEMONSTRATING OOV WORD HANDLING:")
    print(RULE)
    test_oov_words = ['ukuthenga', 'abantwana', 'izinkomo']
    for test_word in test_oov_words:
        # Best-effort demo: a failed lookup is reported, not fatal. Catching
        # Exception (rather than a bare except) lets KeyboardInterrupt and
        # SystemExit propagate.
        try:
            _ = model.wv[test_word]
        except Exception:
            print(f"   '{test_word}': Failed ‚úó")
            continue
        in_vocab = test_word in model.wv.key_to_index
        status = "In vocabulary" if in_vocab else "Computed from n-grams"
        print(f"   '{test_word}': {status} ‚úì")
    print(RULE)

    print("\nüéâ FastText evaluation with subword n-grams complete!")
    print("   Subword n-grams enable better handling of morphologically rich languages like isiZulu!")
    sys.stdout.flush()

ImportError: cannot import name 'FastText' from 'gensim.models.word2vec' (C:\Users\USER-PC\AppData\Local\Packages\PythonSoftwareFoundation.Python.3.11_qbz5n2kfra8p0\LocalCache\local-packages\Python311\site-packages\gensim\models\word2vec.py)