In [1]:
import numpy as np 
import pandas as pd 
import os
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from tqdm.notebook import tqdm  
import matplotlib.pyplot as plt
from sklearn.metrics import (
    confusion_matrix,
    ConfusionMatrixDisplay,
    roc_curve,
    auc,
    RocCurveDisplay,
    PrecisionRecallDisplay,
)
from sklearn.preprocessing import label_binarize
import seaborn as sns
from sklearn.metrics import classification_report
import re
import unicodedata
import matplotlib.pyplot as plt
import seaborn as sns
from nltk.sentiment.vader import SentimentIntensityAnalyzer
import nltk


# Download VADER lexicon
nltk.download('vader_lexicon')


data_dir = r"C:\Users\User\Desktop\Assignment 3 Resources"
for dirname, _, filenames in os.walk(data_dir):
    for filename in filenames:
        print(os.path.join(dirname, filename))

file_path = os.path.join(data_dir, 'corrected_lexicon.xlsx')
print('Using file_path:', file_path)


C:\Users\User\Desktop\Assignment 3 Resources\africaans_test_corpus.xlsx
C:\Users\User\Desktop\Assignment 3 Resources\afroxlmr_test_predictions.csv
C:\Users\User\Desktop\Assignment 3 Resources\Ciluba_test_corpus (1).xlsx
C:\Users\User\Desktop\Assignment 3 Resources\corpus_test_split.csv
C:\Users\User\Desktop\Assignment 3 Resources\corrected_lexicon.xlsx
C:\Users\User\Desktop\Assignment 3 Resources\french_test_corpus.xlsx
C:\Users\User\Desktop\Assignment 3 Resources\machine-learning-on-the-multilingual-lexicon (1).ipynb
C:\Users\User\Desktop\Assignment 3 Resources\test_corpus_processed.csv
C:\Users\User\Desktop\Assignment 3 Resources\zulu_test_corpus.xlsx
C:\Users\User\Desktop\Assignment 3 Resources\Assignment 3 Resources\africaans_test_corpus.xlsx
C:\Users\User\Desktop\Assignment 3 Resources\Assignment 3 Resources\Ciluba_test_corpus (1).xlsx
C:\Users\User\Desktop\Assignment 3 Resources\Assignment 3 Resources\corrected_lexicon.xlsx
C:\Users\User\Desktop\Assignment 3 Resources\Assignment 

[nltk_data] Downloading package vader_lexicon to
[nltk_data]     C:\Users\User\AppData\Roaming\nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!


In [2]:
# Local path for corrected lexicon
file_path = r"C:\Users\User\Desktop\Assignment 3 Resources\corrected_lexicon.xlsx"

# Load the dataset (guard against missing file)
if os.path.exists(file_path):
    df1 = pd.read_excel(file_path)
    print(f"Loaded {file_path} with shape {df1.shape}")
else:
    raise FileNotFoundError(f"Expected lexicon file not found at: {file_path}")

df1.head()


Loaded C:\Users\User\Desktop\Assignment 3 Resources\corrected_lexicon.xlsx with shape (3234, 9)


Unnamed: 0,CILUBA,FRANCAIS,ENGLISH,AFRIKAANS,ZULU,Sepedi,SCORE,SENTIMENT,NATURE
0,umue,un,a,N,I,,0,Neutre,nombre
1,Biabunyi,beaucoup,a lot,baie,okuningi,ga ntši,3,Positif,mot
2,bungi,beaucoup,a lot,baie,okuningi,kudu,0,Neutre,adverbe
3,dilekela,abandon,abandonment,Verlating,Ukulahlwa,hlokomologa,4,Positif,mot
4,Kulekela,abandon,abandonment,verlating,ukulahlwa,hlokomologa,3,Positif,mot


In [3]:
# Rename columns
df1.columns = ['ciluba', 'french', 'english', 'afrikaans', 'zulu', 'sepedi', 'score', 'sentiment', 'nature']

# Drop duplicate rows based on language columns + sentiment (or all columns if you prefer)
df = df1.drop_duplicates(subset=['ciluba', 'french', 'english', 'afrikaans', 'zulu', 'sepedi', 'sentiment'])

# Drop rows with missing values in the language columns + sentiment
df = df1.dropna(subset=['ciluba', 'french', 'english', 'afrikaans', 'zulu', 'sepedi', 'sentiment'])

# Now you can proceed to vectorize, reduce and plot based on these language columns as separate features or however you want.
# For example, you could vectorize each language column separately or combine them as needed.
df



Unnamed: 0,ciluba,french,english,afrikaans,zulu,sepedi,score,sentiment,nature
1,Biabunyi,beaucoup,a lot,baie,okuningi,ga ntši,3,Positif,mot
2,bungi,beaucoup,a lot,baie,okuningi,kudu,0,Neutre,adverbe
3,dilekela,abandon,abandonment,Verlating,Ukulahlwa,hlokomologa,4,Positif,mot
4,Kulekela,abandon,abandonment,verlating,ukulahlwa,hlokomologa,3,Positif,mot
5,kulekela,abandon,abandonment,Verlating,Ukulahlwa,hlokomologa,4,Positif,mot
...,...,...,...,...,...,...,...,...,...
3200,Ya Bunda,construire,build,Bou,Yakha,aga,5,Positive,verbe
3201,Ya ku leta,faire venir,bring,Bring,Letha,tliša,4,Positive,verbe
3202,Yamba,prendre,take,Neem,Thatha,tšeya,3,Positive,verbe
3205,Yeleka,espérer,hope,Hoop,Themba,tshepo,9,Positive,verbe


In [4]:

# ================================
# STEP 1: Load Data
# ================================
# Local files in data_dir set earlier in cell 1 
lexicon_path = os.path.join(r"C:\Users\User\Desktop\Assignment 3 Resources", 'corrected_lexicon.xlsx')
french_test_path = os.path.join(r"C:\Users\User\Desktop\Assignment 3 Resources", 'french_test_corpus.xlsx')

df = pd.read_excel(lexicon_path)
test_corpus_df = pd.read_excel(french_test_path)

# ================================
# STEP 2: Preprocess Lexicon
# ================================
# Rename columns
df.columns = ['ciluba', 'french', 'english', 'afrikaans', 'zulu', 'sepedi', 'score', 'sentiment', 'nature']

# Define languages
supported_languages = ['french', 'afrikaans', 'zulu', 'ciluba', 'sepedi', 'english']

# ================================
# STEP 3: Translation Dictionary
# ================================
def clean_text_for_matching(text):
    text = re.sub(r'[^\w\s]', '', str(text))
    return text.lower()

def create_translation_dicts(df, languages):
    translation_dicts = {src: {tgt: {} for tgt in languages if tgt != src} for src in languages}
    for _, row in df.iterrows():
        for src in languages:
            source_phrase = str(row[src]).strip().lower()
            if pd.isna(source_phrase) or source_phrase == '':
                continue
            for tgt in languages:
                if tgt == src:
                    continue
                target_phrase = str(row[tgt]).strip().lower()
                if pd.isna(target_phrase) or target_phrase == '':
                    continue
                translation_dicts[src][tgt][source_phrase] = target_phrase
    return translation_dicts

translation_dicts = create_translation_dicts(df, supported_languages)
print("✅ Translation dictionaries created.")

# ================================
# STEP 4: Sentiment Scores
# ================================
sentiment_averages = {}
all_sentiments = {}

for lang in supported_languages:
    sentiment_averages[lang] = df.groupby(lang)['score'].mean().to_dict()
    all_sentiments[lang] = df.groupby(lang)['score'].apply(list).to_dict()

print("✅ Sentiment scores calculated.")

# ================================
# STEP 5: Custom Sentiment Function
# ================================
def compute_sentiment_v2(scores):
    if len(scores) == 1:
        return scores[0]
    elif len(scores) == 2:
        return max(scores, key=abs)
    else:
        pos = [s for s in scores if s > 0]
        neg = [s for s in scores if s < 0]
        if len(pos) >= len(neg):
            return sum(pos) / len(pos) if pos else 0
        else:
            return sum(neg) / len(neg) if neg else 0

# ================================
# STEP 6: Translate & Analyze Function
# ================================
def translate_analyze_sentiments_with_vader(text, source_lang, target_lang,
                                            translation_dicts, sentiment_averages, all_sentiments,
                                            vader_analyzer):
    source_lang = str(source_lang).lower()
    target_lang = str(target_lang).lower()

    if source_lang not in translation_dicts or target_lang not in translation_dicts[source_lang]:
        return {k: '' if isinstance(v, str) else 0 for k, v in {
            "translated_text": "",
            "total_score_avg": 0,
            "word_scores_avg": "",
            "sentiment_avg": "neutral",
            "total_score_v2": 0,
            "word_scores_v2": "",
            "sentiment_v2": "neutral",
            "vader_positive": 0,
            "vader_negative": 0,
            "vader_neutral": 0,
            "vader_compound": 0,
            "vader_sentiment": "neutral"
        }.items()}

    cleaned_text = clean_text_for_matching(text)
    words = cleaned_text.split()
    translated_sentence = []
    total_score_avg = 0
    total_score_v2 = 0
    word_scores_avg = []
    word_scores_v2 = []

    i = 0
    while i < len(words):
        matched_phrase = None
        translated_phrase = None
        phrase_score_avg = 0
        phrase_scores_v2 = 0
        max_length = min(5, len(words) - i)

        for j in range(max_length, 0, -1):
            phrase = ' '.join(words[i:i+j])
            if phrase in translation_dicts[source_lang][target_lang]:
                matched_phrase = phrase
                translated_phrase = translation_dicts[source_lang][target_lang][phrase]
                phrase_score_avg = sentiment_averages[source_lang].get(phrase, 0)
                phrase_scores = all_sentiments[source_lang].get(phrase, [])
                phrase_scores_v2 = compute_sentiment_v2(phrase_scores)
                i += j
                break

        if matched_phrase:
            translated_sentence.append(translated_phrase)
            total_score_avg += phrase_score_avg
            total_score_v2 += phrase_scores_v2
            word_scores_avg.append(f"{matched_phrase}:{phrase_score_avg}")
            word_scores_v2.append(f"{matched_phrase}:{phrase_scores_v2}")
        else:
            word = words[i]
            translated_word = translation_dicts[source_lang][target_lang].get(word, word)
            translated_sentence.append(translated_word)
            score_avg = sentiment_averages[source_lang].get(word, 0)
            scores = all_sentiments[source_lang].get(word, [])
            score_v2 = compute_sentiment_v2(scores)
            total_score_avg += score_avg
            total_score_v2 += score_v2
            word_scores_avg.append(f"{word}:{score_avg}")
            word_scores_v2.append(f"{word}:{score_v2}")
            i += 1

    translated_text = ' '.join(translated_sentence).strip()
    sentiment_avg = "positive" if total_score_avg > 0.05 else "negative" if total_score_avg < -0.05 else "neutral"
    sentiment_v2 = "positive" if total_score_v2 > 0.05 else "negative" if total_score_v2 < -0.05 else "neutral"

    vader_scores = vader_analyzer.polarity_scores(text)
    vader_sentiment = "positive" if vader_scores['compound'] >= 0.05 else "negative" if vader_scores['compound'] <= -0.05 else "neutral"

    return {
        "translated_text": translated_text,
        "total_score_avg": total_score_avg,
        "word_scores_avg": '; '.join(word_scores_avg),
        "sentiment_avg": sentiment_avg,
        "total_score_v2": total_score_v2,
        "word_scores_v2": '; '.join(word_scores_v2),
        "sentiment_v2": sentiment_v2,
        "vader_positive": vader_scores['pos'],
        "vader_negative": vader_scores['neg'],
        "vader_neutral": vader_scores['neu'],
        "vader_compound": vader_scores['compound'],
        "vader_sentiment": vader_sentiment
    }

# ================================
# STEP 7: Apply to Test Corpus
# ================================
def safe_translate_and_analyze_sentiments_with_vader(row, translation_dicts, sentiment_averages, all_sentiments, vader_analyzer):
    try:
        return pd.Series(translate_analyze_sentiments_with_vader(
            row.get('sentence', ''),
            row.get('source_language', ''),
            row.get('target_language', ''),
            translation_dicts,
            sentiment_averages,
            all_sentiments,
            vader_analyzer
        ))
    except Exception as e:
        print(f"Error in row {row.name}: {e}")
        return pd.Series({
            "translated_text": "",
            "total_score_avg": 0,
            "word_scores_avg": "",
            "sentiment_avg": "neutral",
            "total_score_v2": 0,
            "word_scores_v2": "",
            "sentiment_v2": "neutral",
            "vader_positive": 0,
            "vader_negative": 0,
            "vader_neutral": 0,
            "vader_compound": 0,
            "vader_sentiment": "neutral"
        })

# Run the sentiment analysis
vader_analyzer = SentimentIntensityAnalyzer()
# Ensure we have the expected columns in test_corpus_df
print('Test corpus columns:', test_corpus_df.columns.tolist())
if 'sentence' not in test_corpus_df.columns:
    raise KeyError("Expected 'sentence' column in test corpus. Found: " + ','.join(test_corpus_df.columns.astype(str)))

# Apply translation+sentiment
test_corpus_df = test_corpus_df.copy()

# For speed in this environment, process a sample (or full if small)
sample_size = min(len(test_corpus_df), 500)
print(f"Processing {sample_size} rows (of {len(test_corpus_df)})")

test_corpus_df_sample = test_corpus_df.iloc[:sample_size].copy()

results = test_corpus_df_sample.apply(
    lambda row: safe_translate_and_analyze_sentiments_with_vader(
        row,
        translation_dicts,
        sentiment_averages,
        all_sentiments,
        vader_analyzer
    ), axis=1
)

# Join results back
test_corpus_df = test_corpus_df_sample.join(results)

print("✅ Sentiment analysis applied to test corpus (sample).")

# ================================
# STEP 8: Optional Evaluation
# ================================
# Map sentiment strings to numeric
sentiment_mapping = {'negative': -1, 'neutral': 0, 'positive': 1}
if 'sentiment_v2' in test_corpus_df.columns:
    test_corpus_df['custom_sentiment_numeric'] = test_corpus_df['sentiment_v2'].map(sentiment_mapping)
if 'vader_sentiment' in test_corpus_df.columns:
    test_corpus_df['vader_sentiment_numeric'] = test_corpus_df['vader_sentiment'].map(sentiment_mapping)

# ================================
# STEP 9: Display Results
# ================================
print("📊 Preview of sentiment analysis results:")
display(test_corpus_df.head(20))


✅ Translation dictionaries created.
✅ Sentiment scores calculated.
Test corpus columns: ['source_language', 'target_language', 'sentence']
Processing 500 rows (of 2999)
✅ Sentiment analysis applied to test corpus (sample).
📊 Preview of sentiment analysis results:


Unnamed: 0,source_language,target_language,sentence,translated_text,total_score_avg,word_scores_avg,sentiment_avg,total_score_v2,word_scores_v2,sentiment_v2,vader_positive,vader_negative,vader_neutral,vader_compound,vader_sentiment,custom_sentiment_numeric,vader_sentiment_numeric
0,french,english,Arrange pagne proteger Comportement Seulement,arrange loincloth protect behavior only,10.6,arrange:1.0; pagne:2.0; proteger:3.0; comporte...,positive,11.6,arrange:1; pagne:3; proteger:3; comportement:1...,positive,0.0,0.0,1.0,0.0,neutral,1,0
1,french,ciluba,Rearrange mordre purifier Vérité bourse,akajilula kusuma kutokesha bulelela tshibombu,8.566667,rearrange:1.0; mordre:-2.0; purifier:3.6666666...,positive,9.066667,rearrange:1; mordre:-2; purifier:3.66666666666...,positive,0.0,0.0,1.0,0.0,neutral,1,0
2,french,afrikaans,Parle aisé Serpent Mère Abhorrer,praat maklik slang moeder verafsku,9.0,parle:2.0; aisé:3.0; serpent:-2.25; mère:2.25;...,positive,7.25,parle:2; aisé:3; serpent:-4.0; mère:2.25; abho...,positive,0.0,0.0,1.0,0.0,neutral,1,0
3,french,zulu,Parler à nouveau murmurer chanter Sein castrer,khuluma futhi nyenyeza cula isibele xholosa,16.642857,parler à nouveau:2.0; murmurer:4.0; chanter:3....,positive,16.642857,parler à nouveau:2; murmurer:4; chanter:3.1428...,positive,0.0,0.0,1.0,0.0,neutral,1,0
4,french,english,Remet Déchirure Infidèle Étrangler Kubela,put back tear unfaithful strangle kubela,-1.8,remet:3.2; déchirure:-4.0; infidèle:-5.0; étra...,negative,-1.8,remet:3.2; déchirure:-4; infidèle:-5; étrangle...,negative,0.0,0.0,1.0,0.0,neutral,-1,0
5,french,ciluba,Dis Fétiche finir Rêver corps,amba manga tshinda kulota mubidimbidi,6.714286,dis:3.0; fétiche:-3.0; finir:1.333333333333333...,positive,9.380952,dis:3; fétiche:-3.0; finir:4.0; rêver:2.666666...,positive,0.0,0.0,1.0,0.0,neutral,1,0
6,french,afrikaans,Superposer Preparer bruit cérémoniecoutumière ...,superponeer voorberei geraas gebruiklike serem...,13.266667,superposer:2.2; preparer:3.4; bruit:2.66666666...,positive,13.266667,superposer:2.2; preparer:3.4; bruit:2.66666666...,positive,0.0,0.0,1.0,0.0,neutral,1,0
7,french,zulu,Ramasse parfaite voler cueillire trente-cinq,phakamisa gweda ndiza ukhethiwe trentecinq,3.8,ramasse:4.0; parfaite:3.0; voler:-4.2; cueilli...,positive,4.8,ramasse:4; parfaite:4; voler:-4.2; cueillire:1...,positive,0.0,0.0,1.0,0.0,neutral,1,0
8,french,english,Dépêche maladie chapeau rouler repasser,dispatch illness hat to roll go back,9.55,dépêche:4.0; maladie:-0.7; chapeau:2.75; roule...,positive,8.25,dépêche:4; maladie:-2.5; chapeau:2.75; rouler:...,positive,0.0,0.0,1.0,0.0,neutral,1,0
9,french,ciluba,Répète Galère Expliquer trasformer voler,ambulula dikenga kuvuija kukudimuna kuiba,0.183333,répète:2.8; galère:-3.75; expliquer:4.33333333...,positive,0.183333,répète:2.8; galère:-3.75; expliquer:4.33333333...,positive,0.0,0.0,1.0,0.0,neutral,1,0


In [5]:
# Save the processed test corpus (current `test_corpus_df` in notebook)
import os
out_path = os.path.join(data_dir, 'test_corpus_processed.csv')
# Ensure DataFrame exists
if 'test_corpus_df' in globals():
    try:
        test_corpus_df.to_csv(out_path, index=False, encoding='utf-8')
        print(f"Saved processed test_corpus_df to: {out_path}")
    except Exception as e:
        print('Error saving CSV:', e)
else:
    print('test_corpus_df not found in the notebook namespace.')


Saved processed test_corpus_df to: C:\Users\User\Desktop\Assignment 3 Resources\test_corpus_processed.csv


In [6]:
# ================================
# STEP 1: Install Required Libraries (SIMPLIFIED)
# ================================
# Run this cell AFTER the NumPy fix cell above and kernel restart

import subprocess
import sys

print("📦 Installing transformer packages...")
print("=" * 60)

# Install packages one by one with proper error handling
packages = [
    ('transformers', 'transformers'),
    ('torch', 'torch --index-url https://download.pytorch.org/whl/cpu'),
    ('sentencepiece', 'sentencepiece'),
]

for name, install_cmd in packages:
    print(f"\n📥 Installing {name}...")
    try:
        result = subprocess.run(
            [sys.executable, '-m', 'pip', 'install'] + install_cmd.split(),
            capture_output=True,
            text=True,
            timeout=300
        )
        if result.returncode == 0:
            print(f"   ✅ {name} installed")
        else:
            print(f"   ⚠️ {name} may have issues: {result.stderr[:200]}")
    except Exception as e:
        print(f"   ❌ Error with {name}: {str(e)[:200]}")

print("\n" + "=" * 60)
print("✅ Installation complete!")
print("\n⚠️ IMPORTANT: Restart the kernel now!")
print("   Go to: Kernel → Restart Kernel")

📦 Installing transformer packages...

📥 Installing transformers...
   ✅ transformers installed

📥 Installing torch...
   ✅ torch installed

📥 Installing sentencepiece...
   ✅ sentencepiece installed

✅ Installation complete!

⚠️ IMPORTANT: Restart the kernel now!
   Go to: Kernel → Restart Kernel


In [7]:
# ================================
# STEP 1b: Verify Installation (Run this after kernel restart)
# ================================
# Run this cell to check if packages are properly installed

import sys

def check_package(package_name):
    try:
        __import__(package_name)
        return True
    except ImportError:
        return False

packages_to_check = {
    'torch': 'PyTorch',
    'transformers': 'Transformers',
    'sentencepiece': 'SentencePiece',
}

print("Checking installed packages...")
print("=" * 50)
all_installed = True

for pkg, name in packages_to_check.items():
    if check_package(pkg):
        print(f"✅ {name} ({pkg}): Installed")
    else:
        print(f"❌ {name} ({pkg}): NOT installed")
        all_installed = False

print("=" * 50)
if all_installed:
    print("\n✅ All packages are installed! You can proceed.")
else:
    print("\n❌ Some packages are missing. Please:")
    print("   1. Run the installation cell above")
    print("   2. Restart the kernel")
    print("   3. Run this cell again")

Checking installed packages...
✅ PyTorch (torch): Installed


  from .autonotebook import tqdm as notebook_tqdm


✅ Transformers (transformers): Installed
✅ SentencePiece (sentencepiece): Installed

✅ All packages are installed! You can proceed.


In [8]:
# ================================
# EVERYTHING BELOW IS WITH REGARDS TO MODEL TRAINING 
# ================================

In [9]:
# ================================
# STEP 2: Load Transformer Models and Prepare Data
# ================================
# Run this cell AFTER restarting the kernel following package installation

import pandas as pd
import numpy as np
import torch
from transformers import AutoTokenizer, AutoModelForSequenceClassification

# Check if GPU is available
device = "cuda" if torch.cuda.is_available() else "cpu"

# Model names from HuggingFace
afroxlmr_model = "Davlan/afro-xlmr-base"  # AfroXLMR base model
afriberta_model = "castorini/afriberta_base"  # AfriBERTa base model

In [10]:
#CELLS BELOW IS CREATING THREE TYPES OF DATA:
#STORING LEXICON INDIVIDUAL WORDS
#CREATING SENTENCES FROM LESXICON WORDS
#USING CORPUS SENTENCES BOTH TRANSLATED AND ORIGINAL

In [11]:
# ================================
# THIS CELL IS PURELY FOR LEXICON INDIVIDUAL WORDS - WE DONT HAVE TO INCLUDE JUST IN CASE
# ================================
# We'll use the lexicon data to create labeled training examples
# The lexicon has sentiment labels we can use for fine-tuning


# Problem: Lexicon has mixed French/English labels ("Positif", "positive", "Negatif", etc.)
# Solution: Convert all variants to consistent English lowercase labels
def normalize_sentiment(sentiment):
    sentiment = str(sentiment).lower().strip()  # Convert to lowercase and remove spaces
    
    # Map all variants to standard labels
    if sentiment in ['positif', 'positive']:
        return 'positive'
    elif sentiment in ['negatif', 'negative']:
        return 'negative'
    elif sentiment in ['neutre', 'neutral']:
        return 'neutral'
    else:
        return None  # Invalid labels will be removed later

# Apply normalization to all sentiment labels
df['sentiment_normalized'] = df['sentiment'].apply(normalize_sentiment)

# Remove rows with unmapped/invalid sentiments
df_clean = df[df['sentiment_normalized'].notna()].copy()


# --------------------------------------------
# 2. EXTRACT WORDS FROM EACH LANGUAGE
# --------------------------------------------
# For each language column (French, English, Zulu, etc.), extract the word + sentiment
# This creates separate training examples for each language's words
# Example: "beaucoup" (French) → positive, "a lot" (English) → positive

train_data = []  # Will hold data from all languages

for lang in supported_languages:  # Loop through: french, afrikaans, zulu, ciluba, sepedi, english
    # Extract the language column and sentiment
    temp_df = df_clean[[lang, 'sentiment_normalized']].copy()
    temp_df.columns = ['text', 'sentiment']
    
    # Clean up the data
    temp_df = temp_df.dropna(subset=['text', 'sentiment'])  # Remove empty cells
    temp_df['text'] = temp_df['text'].astype(str)           # Ensure text is string
    temp_df = temp_df[temp_df['text'].str.strip() != '']    # Remove blank strings
    temp_df = temp_df[temp_df['text'].str.lower() != 'nan'] # Remove "nan" strings
    
    # Add this language's data to the list
    train_data.append(temp_df)

# --------------------------------------------
# 3. COMBINE ALL LANGUAGES
# --------------------------------------------
# Merge all language data into one big training set
# Result: bungi, beaucoup, a lot, baie, okuningi, kudu all become separate training examples
train_df = pd.concat(train_data, ignore_index=True)
# Remove duplicate words (if same word appears in multiple rows)
train_df = train_df.drop_duplicates(subset=['text'])

# --------------------------------------------
# 4. CONVERT SENTIMENT TO NUMBERS
# --------------------------------------------
# Transformers need numeric labels, not text
# negative → 0, neutral → 1, positive → 2
sentiment_to_label = {'negative': 0, 'neutral': 1, 'positive': 2}  # Text → Number
label_to_sentiment = {0: 'negative', 1: 'neutral', 2: 'positive'}  # Number → Text (for later)

train_df['label'] = train_df['sentiment'].map(sentiment_to_label)

# Remove any rows where mapping failed (shouldn't happen, but just in case)
train_df = train_df.dropna(subset=['label'])
train_df['label'] = train_df['label'].astype(int)  # Ensure label is integer

# --------------------------------------------
# 5. DISPLAY RESULTS
# --------------------------------------------
print(f"\n✅ Training data prepared:")
print(f"   Total examples: {len(train_df)}")
print(f"   Label distribution:\n{train_df['label'].value_counts()}")
print(f"\n   Sample examples:")
display(train_df.head(10))


✅ Training data prepared:
   Total examples: 10454
   Label distribution:
label
2    8599
0     980
1     875
Name: count, dtype: int64

   Sample examples:


Unnamed: 0,text,sentiment,label
0,un,neutral,1
1,beaucoup,positive,2
3,abandon,positive,2
6,abhorrer,positive,2
7,capacité,positive,2
8,abolir,positive,2
9,abolition,positive,2
10,abominable,positive,2
11,avorter,negative,0
12,absence,positive,2


In [12]:
# ================================
# CREATING SENTENCES USING LEXICON WORDS FOR MORE DATA - MAYBE LEXICON TEAM SHOULD DO THIS BUT JUST IN CASE ITS HERE
# ================================
# Convert isolated lexicon words into sentence contexts

print("🔄 Augmenting lexicon words into sentence templates...")

# Sentence templates for different languages
templates_by_lang = {
    'french': [
        "Je trouve que {} est important.",
        "C'est {}.",
        "Le mot {} exprime un sentiment.",
        "{} dans cette phrase."
    ],
    'english': [
        "I think {} is important.",
        "This is {}.",
        "The word {} expresses a feeling.",
        "{} in this sentence."
    ],
    'afrikaans': [
        "Dit is {}.",
        "Die woord {} is belangrik.",
        "{} in hierdie sin."
    ],
    'zulu': [
        "Lokhu {}.",
        "Igama {} libalulekile.",
        "{} kulesi sigaba."
    ],
    'ciluba': [
        "Ici {}.",
        "Ijambu {} lidipingana.",
        "{} mumpanzu."
    ],
    'sepedi': [
        "Se ke {}.",
        "Lentšu {} le bohlokwa.",
        "{} mo polelong ye."
    ]
}

# Create augmented dataset
aug_data = []

for _, row in df_clean.iterrows():
    sentiment = row['sentiment_normalized']
    if pd.isna(sentiment):
        continue
    
    # For each language, create template-based sentences
    for lang in supported_languages:
        word = str(row[lang]).strip()
        if not word or pd.isna(word) or word.lower() == 'nan':
            continue
        
        # Use language-specific templates if available, else use French templates
        templates = templates_by_lang.get(lang, templates_by_lang['french'])
        
        # Create 2-3 sentences per word (not all templates to avoid too much data)
        for template in templates[:2]:
            try:
                sentence = template.format(word)
                aug_data.append({
                    'text': sentence,
                    'sentiment': sentiment,
                    'source': 'lexicon_augmented',
                    'language': lang
                })
            except:
                continue

aug_df = pd.DataFrame(aug_data)
print(f"✅ Created {len(aug_df)} augmented sentences from lexicon")
print(f"   Distribution: {aug_df['sentiment'].value_counts().to_dict()}")

🔄 Augmenting lexicon words into sentence templates...
✅ Created 38008 augmented sentences from lexicon
   Distribution: {'positive': 32128, 'negative': 3440, 'neutral': 2440}


In [13]:
# ================================
# LOADING CORPUS DATA AND SPLITTING TRAIN/TEST WE ALSO INCLUDE TRANSLATED SENTENCES
# ================================

# Load the processed corpus
corpus_path = os.path.join(data_dir, 'test_corpus_processed.csv')
if os.path.exists(corpus_path):
    from sklearn.model_selection import train_test_split
    
    corpus_df = pd.read_csv(corpus_path)
    print(f"✅ Loaded corpus with {len(corpus_df)} sentences")
    
    # Extract French sentences with sentiment_v2 labels
    corpus_data = corpus_df[['sentence', 'sentiment_v2']].copy()
    corpus_data.columns = ['text', 'sentiment']
    corpus_data = corpus_data.dropna(subset=['text', 'sentiment'])
    
    # SPLIT: 70% train, 30% test (stratified by sentiment)
    corpus_train, corpus_test, train_indices, test_indices = train_test_split(
        corpus_data,
        corpus_data.index,  # Keep track of indices for translation lookup
        test_size=0.3,
        random_state=42,
        stratify=corpus_data['sentiment']
    )
    
    corpus_train['source'] = 'corpus'
    corpus_train['language'] = 'french'
    
    print(f"✅ Corpus split completed:")
    print(f"   Training: {len(corpus_train)} French sentences")
    print(f"   Testing: {len(corpus_test)} French sentences (held-out for evaluation)")
    
    # ========================================
    # ADD TRANSLATED SENTENCES
    # ========================================
    
    # Check if translated_text and target_language columns exist
    if 'translated_text' in corpus_df.columns and 'target_language' in corpus_df.columns:
        translated_train_data = []
        
        # For each training sample, get its translated version
        for idx in train_indices:
            row = corpus_df.loc[idx]
            
            # Check if translation exists and is not empty
            translated_text = str(row.get('translated_text', '')).strip()
            target_lang = str(row.get('target_language', '')).strip().lower()
            sentiment = row.get('sentiment_v2', '')
            
            if translated_text and translated_text != 'nan' and len(translated_text) > 0:
                # Only add if target language is different from French
                if target_lang and target_lang != 'french':
                    translated_train_data.append({
                        'text': translated_text,
                        'sentiment': sentiment,
                        'source': 'corpus_translated',
                        'language': target_lang
                    })
        
        # Add translated sentences to training data
        if translated_train_data:
            corpus_train_translated = pd.DataFrame(translated_train_data)
            corpus_train = pd.concat([corpus_train, corpus_train_translated], ignore_index=True)
            
            print(f"   ✅ Added {len(translated_train_data)} translated sentences")
            print(f"   Languages distribution:")
            lang_counts = corpus_train_translated['language'].value_counts()
            for lang, count in lang_counts.items():
                print(f"      - {lang}: {count} sentences")
            print(f"\n   📊 Total training sentences: {len(corpus_train)}")
            print(f"      - French (original): {(corpus_train['source'] == 'corpus').sum()}")
            print(f"      - Translated: {(corpus_train['source'] == 'corpus_translated').sum()}")
        else:
            print(f"   ⚠️ No valid translations found in corpus")
    else:
        print(f"   ℹ️ No 'translated_text' or 'target_language' columns found")
        print(f"   Available columns: {corpus_df.columns.tolist()}")
        print(f"   Continuing with French sentences only")
    
    print(f"\n   Train sentiment distribution: {corpus_train['sentiment'].value_counts().to_dict()}")
    print(f"   Test sentiment distribution: {corpus_test['sentiment'].value_counts().to_dict()}")
    
    # Save test split for later evaluation
    test_split_path = os.path.join(data_dir, 'corpus_test_split.csv')
    corpus_test.to_csv(test_split_path, index=False)
    print(f"\n   💾 Saved held-out test set to: corpus_test_split.csv")
else:
    print(f"⚠️ Corpus file not found at {corpus_path}")
    print("   Will use only lexicon data for training")
    corpus_train = pd.DataFrame(columns=['text', 'sentiment', 'source', 'language'])
    corpus_test = pd.DataFrame(columns=['text', 'sentiment'])



✅ Loaded corpus with 500 sentences
✅ Corpus split completed:
   Training: 350 French sentences
   Testing: 150 French sentences (held-out for evaluation)
   ✅ Added 350 translated sentences
   Languages distribution:
      - afrikaans: 96 sentences
      - english: 87 sentences
      - zulu: 84 sentences
      - ciluba: 83 sentences

   📊 Total training sentences: 700
      - French (original): 350
      - Translated: 350

   Train sentiment distribution: {'positive': 664, 'negative': 34, 'neutral': 2}
   Test sentiment distribution: {'positive': 142, 'negative': 7, 'neutral': 1}

   💾 Saved held-out test set to: corpus_test_split.csv


In [14]:
#CELLS ABOVE IS CREATING THREE TYPES OF DATA:
#STORING LEXICON INDIVIDUAL WORDS
#CREATING SENTENCES FROM LESXICON WORDS
#USING CORPUS SENTENCES BOTH TRANSLATED AND ORIGINAL

In [15]:
# ================================
# COMBINE ALL TRAINING DATA SOURCES
# ================================
#TRAINING DATA - If we want to include lexicon singular words
lexicon_original = train_df[['text', 'sentiment']].copy() # Individual words from lexicon and sentiment values: triste - negative
lexicon_original['source'] = 'lexicon_original'  # Adds column called source to see where it came from: triste - negative - lexicon_original
lexicon_original['language'] = 'mixed' # Add columns called language to indicate mixed languages

# Takes data from sources and combines them into one big dataframe for training
combined_train_df = pd.concat([
    # lexicon_original, COMMENTED OUT: unless we want model to have context of specific words
    # aug_df[['text', 'sentiment', 'source', 'language']], COMMENTED OUT: ~7,000 CREATED sentences from lexicon (uncomment to use)
    corpus_train[['text', 'sentiment', 'source', 'language']] # Uses Corpus sentences with sentiment values
], ignore_index=True)

# Remove duplicates
combined_train_df = combined_train_df.drop_duplicates(subset=['text'])

# For each word/sentence it looks at the sentiment value and maps it to a number
combined_train_df['label'] = combined_train_df['sentiment'].map(sentiment_to_label)
combined_train_df = combined_train_df.dropna(subset=['label'])
combined_train_df['label'] = combined_train_df['label'].astype(int)


#JUST SOME LOGGING TO SEE THE DATA THE MODELS WILL USE
print(f"\n✅ Combined training data created:")
print(f"   Total examples: {len(combined_train_df)}")
print(f"   - Lexicon original: {(combined_train_df['source'] == 'lexicon_original').sum()}")
print(f"   - Lexicon augmented: {(combined_train_df['source'] == 'lexicon_augmented').sum()}")
print(f"   - Corpus (French): {(combined_train_df['source'] == 'corpus').sum()}")
print(f"   - Corpus (Translated): {(combined_train_df['source'] == 'corpus_translated').sum()}")
print(f"\n   Language distribution:")
if 'language' in combined_train_df.columns:
    print(combined_train_df['language'].value_counts().to_dict())
print(f"\n   Label distribution:")
print(combined_train_df['label'].value_counts().sort_index())
print(f"\n   Sample from each source:")
for src in combined_train_df['source'].unique():
    sample = combined_train_df[combined_train_df['source'] == src].head(2)
    print(f"\n   {src}:")
    for _, row in sample.iterrows():
        lang_info = f" [{row.get('language', 'unknown')}]" if 'language' in row else ""
        print(f"      {row['text'][:60]}...{lang_info} → {row['sentiment']}")

# Update train_df to use combined data
train_df = combined_train_df[['text', 'label']].copy()
print(f"\n✅ Ready to train with {len(train_df)} examples!")


✅ Combined training data created:
   Total examples: 700
   - Lexicon original: 0
   - Lexicon augmented: 0
   - Corpus (French): 350
   - Corpus (Translated): 350

   Language distribution:
{'french': 350, 'afrikaans': 96, 'english': 87, 'zulu': 84, 'ciluba': 83}

   Label distribution:
label
0     34
1      2
2    664
Name: count, dtype: int64

   Sample from each source:

   corpus:
      coude lit Dehors galère captif... [french] → positive
      Courir Larme Doigt combo Poing... [french] → positive

   corpus_translated:
      lukenyibu bulalu kuya dikenga mupika... [ciluba] → positive
      gijima izinyembezi umunwe isivalo inqindi... [zulu] → positive

✅ Ready to train with 700 examples!


In [16]:
#BELOW WE FINE TUNE & TRAIN AfroXLMR USING THE COMBINED TRAINING DATA

In [17]:
# ================================
# FINE TUNE AfroXLMR - Getting everything ready to train AfroXLMR model
# ================================

# ==================================================================================
#Import classes from hugging face transformers
from transformers import (
    AutoTokenizer, 
    AutoModelForSequenceClassification,
    TrainingArguments, 
    Trainer,
    DataCollatorWithPadding
)
#To split train and test data
from sklearn.model_selection import train_test_split
#Evaluation metrix to test model performance
from sklearn.metrics import accuracy_score, precision_recall_fscore_support
#Deep learning framework
import torch
from torch.utils.data import Dataset
# ==================================================================================

# ==================================================================================
# Create PyTorch Dataset it will accept texts ("Je suis triste", "C'est beaucoup") and labels (0, 1, 2)
class SentimentDataset(Dataset):
    def __init__(self, texts, labels, tokenizer, max_length=128):
        self.encodings = tokenizer(texts, truncation=True, padding=True, max_length=max_length)
        self.labels = labels
    
    def __getitem__(self, idx):
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        item['labels'] = torch.tensor(self.labels[idx])
        return item
    
    def __len__(self):
        return len(self.labels)
# ==================================================================================

# ==================================================================================
# Split data into train/validation
train_texts = train_df['text'].tolist()
train_labels = train_df['label'].tolist()

X_train, X_val, y_train, y_val = train_test_split(
    train_texts, train_labels, test_size=0.2, random_state=42, stratify=train_labels
)

print(f"Training samples: {len(X_train)}")
print(f"Validation samples: {len(X_val)}")
# ==================================================================================

# ==================================================================================
# Load AfroXLMR tokenizer and model
afroxlmr_tokenizer = AutoTokenizer.from_pretrained(afroxlmr_model)
afroxlmr_classifier = AutoModelForSequenceClassification.from_pretrained(
    afroxlmr_model,
    num_labels=3,  # negative, neutral, positive
    id2label=label_to_sentiment,
    label2id=sentiment_to_label
)
# ==================================================================================

# Create datasets
train_dataset = SentimentDataset(X_train, y_train, afroxlmr_tokenizer)
val_dataset = SentimentDataset(X_val, y_val, afroxlmr_tokenizer)


Training samples: 560
Validation samples: 140


Some weights of XLMRobertaForSequenceClassification were not initialized from the model checkpoint at Davlan/afro-xlmr-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [18]:
# ================================
# Train AfroXLMR Model
# ================================

# Define metrics computation
def compute_metrics(pred):
    labels = pred.label_ids
    preds = pred.predictions.argmax(-1)
    precision, recall, f1, _ = precision_recall_fscore_support(labels, preds, average='weighted')
    acc = accuracy_score(labels, preds)
    return {
        'accuracy': acc,
        'f1': f1,
        'precision': precision,
        'recall': recall
    }

# Training arguments
# Note: Using updated parameter names for newer transformers versions
training_args = TrainingArguments(
    output_dir='./results_afroxlmr',
    num_train_epochs=3,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    warmup_steps=100,
    weight_decay=0.01,
    logging_dir='./logs',
    logging_steps=50,
    eval_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
    metric_for_best_model="f1",
)

# Create Trainer
trainer_afroxlmr = Trainer(
    model=afroxlmr_classifier,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    compute_metrics=compute_metrics,
)

print("Starting AfroXLMR training...")

# Train the model
trainer_afroxlmr.train()

print("\nAfroXLMR training completed!")

# Evaluate
eval_results = trainer_afroxlmr.evaluate()
print(f"\nAfroXLMR Validation Results:")
for key, value in eval_results.items():
    print(f"   {key}: {value:.4f}")

Starting AfroXLMR training...




Epoch,Training Loss,Validation Loss,Accuracy,F1,Precision,Recall
1,No log,0.23718,0.95,0.925641,0.9025,0.95
2,0.592900,0.225907,0.95,0.925641,0.9025,0.95
3,0.228200,0.235247,0.95,0.925641,0.9025,0.95


  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])



AfroXLMR training completed!





AfroXLMR Validation Results:
   eval_loss: 0.2372
   eval_accuracy: 0.9500
   eval_f1: 0.9256
   eval_precision: 0.9025
   eval_recall: 0.9500
   eval_runtime: 2.0722
   eval_samples_per_second: 67.5610
   eval_steps_per_second: 4.3430
   epoch: 3.0000


  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])


In [19]:
#BELOW I WILL VISUALISE AfroXLMR, the issue is the data is very biased mostly positive so the model will definetely reflect that:
#Positive (label 2): 8,599 examples (83.4%)
#Negative (label 0): 980 examples (9.5%)
#Neutral (label 1): 875 examples (8.5%)

#Maybe we use class weights, or use data augmentation to balance the classes


In [None]:
# ================================
# AfroXLMR Sentiment Classifier - XAI, Ensemble, Evaluation, Data Loading
# Adds methods required by the assignment rubric: attention extraction/visualization,
# ensemble-ready probability/logit outputs, per-language evaluation, and lexicon/corpus loaders.
# This cell defines the AfroXLMRSentimentClassifier class and an example usage block.
# ================================
import torch
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from transformers import AutoTokenizer, AutoModelForSequenceClassification
from sklearn.metrics import confusion_matrix, classification_report, accuracy_score, precision_score, recall_score, f1_score
import os
import warnings
from typing import List, Tuple, Dict, Any, Optional
import torch.nn.functional as F

warnings.filterwarnings('ignore')

class AfroXLMRSentimentClassifier:
    """A wrapper around a HuggingFace AfroXLMR sequence classification model with:
    - XAI methods (attention extraction / visualization / important token analysis)
    - Ensemble-ready outputs (probabilities / logits)
    - Detailed evaluation utilities (confusion matrix, per-language metrics)
    - Static data loading helpers (lexicon & corpus)

    Parameters
    ----------
    model_name: str
        HuggingFace model id to load (default: Davlan/afro-xlmr-base)
    device: Optional[torch.device]
        Torch device (auto-detected by default)
    """

    def __init__(self, model_name: str = 'Davlan/afro-xlmr-base', device: Optional[torch.device] = None):
        # Device setup
        self.device = device if device is not None else (torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu'))
        self.model_name = model_name

        # Load tokenizer and model; enable attentions and hidden states for XAI
        self.tokenizer = AutoTokenizer.from_pretrained(model_name)
        self.model = AutoModelForSequenceClassification.from_pretrained(model_name)
        # Ensure the model outputs attentions & hidden states (rubric requirement)
        self.model.config.output_attentions = True
        self.model.config.output_hidden_states = True
        self.model.to(self.device)

        # Try to get id2label mapping from model config; if missing provide default mapping
        try:
            self.id2label = {int(k): v for k, v in self.model.config.id2label.items()}
        except Exception:
            self.id2label = {0: 'negative', 1: 'neutral', 2: 'positive'}

        self.label2id = {v: k for k, v in self.id2label.items()}

    def _tokenize(self, texts: List[str], max_length: int = 128):
        """Tokenize a list of texts and move tensors to the model device.

        Returns the tokenized batch (dict of tensors).
        """
        encoded = self.tokenizer(texts, truncation=True, padding=True, max_length=max_length, return_tensors='pt')
        return {k: v.to(self.device) for k, v in encoded.items()}

    def predict(self, texts: List[str]) -> List[Dict[str, Any]]:
        """Predict sentiment labels (readable) and confidence for a list of texts.

        Returns a list of dicts: {text, sentiment, label, confidence, scores} where scores is a dict of class probabilities.
        """
        if not isinstance(texts, list):
            texts = [texts]
        batch = self._tokenize(texts)
        self.model.eval()
        with torch.no_grad():
            outputs = self.model(**batch)
            logits = outputs.logits
            probs = F.softmax(logits, dim=-1).cpu().numpy()

        results = []
        for i, text in enumerate(texts):
            pred_label = int(np.argmax(probs[i]))
            pred_sentiment = self.id2label.get(pred_label, str(pred_label))
            scores = {self.id2label.get(idx, str(idx)): float(probs[i, idx]) for idx in range(probs.shape[1])}
            results.append({
                'text': text,
                'sentiment': pred_sentiment,
                'label': pred_label,
                'confidence': float(probs[i, pred_label]),
                'scores': scores
            })
        return results

    def predict_with_probabilities(self, texts: List[str]) -> np.ndarray:
        """Return class probabilities for an input list of texts.

        Returns a numpy array shape (n_texts, n_classes).
        Useful for ensemble combination.
        """
        if not isinstance(texts, list):
            texts = [texts]
        batch = self._tokenize(texts)
        self.model.eval()
        with torch.no_grad():
            outputs = self.model(**batch)
            logits = outputs.logits.cpu()
            probs = F.softmax(logits, dim=-1).numpy()
        return probs

    def get_logits(self, texts: List[str]) -> np.ndarray:
        """Return raw logits for the provided texts (ensemble use).

        Returns numpy array shape (n_texts, n_classes).
        """
        if not isinstance(texts, list):
            texts = [texts]
        batch = self._tokenize(texts)
        self.model.eval()
        with torch.no_grad():
            outputs = self.model(**batch)
            logits = outputs.logits.cpu().numpy()
        return logits

    def get_attention_weights(self, text: str) -> Dict[str, Any]:
        """Tokenize input text, run the model with attentions enabled and return token list,
        attention tensors, prediction and sentiment string.

        Returns dict: { 'tokens', 'attentions' (list of numpy arrays per layer), 'prediction', 'sentiment' }
        Handles device transfers and single-text input.
        """
        if not isinstance(text, str):
            raise ValueError('text must be a single string')
        encoded = self.tokenizer([text], return_tensors='pt', truncation=True, padding=True).to(self.device)
        self.model.eval()
        with torch.no_grad():
            outputs = self.model(**encoded)

        # Extract attentions: tuple(num_layers) of tensors (batch, heads, seq, seq)
        attentions = outputs.attentions
        # Convert to cpu numpy for downstream use
        attentions_np = [att.cpu().numpy() for att in attentions] if attentions is not None else []

        # Tokens for display
        input_ids = encoded['input_ids'].cpu().numpy()[0]
        tokens = self.tokenizer.convert_ids_to_tokens(input_ids)

-1
,

## 1. Import and Setup
This section prepares the environment, loads required libraries, and instantiates the `AfroXLMRSentimentClassifier`.
We will also ensure the VADER lexicon is available for the lexicon-based sentiment components.

In [None]:
# Import additional libs used in the new sections
import os
import nltk
from nltk.sentiment.vader import SentimentIntensityAnalyzer
nltk.download('vader_lexicon')

# Instantiate classifier (uses the class defined earlier in the notebook)
try:
    clf = AfroXLMRSentimentClassifier()
    print('AfroXLMRSentimentClassifier instantiated on device:', clf.device)
except Exception as e:
    print('Could not instantiate classifier (missing packages or offline). Error:', e)

# Define default paths (adjust to your environment)
LEXICON_PATH = 'expanded_lexicon.csv'
CORPUS_PATH = 'shonasenti.csv'  # Example corpus file
OUTPUT_DIR = './afroxlmr_trained'
os.makedirs(OUTPUT_DIR, exist_ok=True)

## 2. Load Expanded Lexicon Data
Load multilingual lexicon entries (zulu, xhosa, sepedi, shona, afrikaans, english).
The `load_from_lexicon` static method will return parallel lists of texts, labels and language tags.

In [None]:
# Load lexicon using the classifier helper (falls back to sample data if file missing)
if 'clf' in globals() and hasattr(clf, 'load_from_lexicon') and os.path.exists(LEXICON_PATH):
    lex_texts, lex_labels, lex_langs = clf.load_from_lexicon(LEXICON_PATH, target_languages=['zulu','xhosa','sepedi','shona','afrikaans','english'])
    print(f'Loaded lexicon samples: {len(lex_texts)}')
else:
    print('Lexicon file not found or classifier missing. Using demo lexicon entries.')
    lex_texts = ['Ngiyajabula namhlanje', 'Ndiri kufara nekuti bhuku iri rakanaka', 'Ke a nyorilwe', 'Dit is baie sleg']
    lex_labels = [2, 2, 0, 0]
    lex_langs = ['zulu','shona','sepedi','afrikaans']

# Quick sample preview
for t, l, lg in zip(lex_texts[:5], lex_labels[:5], lex_langs[:5]):
    print(f'[{lg}] {t} -> {l}')

## 3. Load Corpus Data
Load an optional corpus (e.g., ShonaSenti) via `load_from_corpus`. This is useful to provide sentence-level data for fine-tuning.

In [None]:
# Load corpus if available (fall back to empty lists)
if 'clf' in globals() and hasattr(clf, 'load_from_corpus') and os.path.exists(CORPUS_PATH):
    corp_texts, corp_labels, corp_langs = clf.load_from_corpus(CORPUS_PATH, text_column='text', label_column='sentiment', language_column='language')
    print(f'Loaded corpus samples: {len(corp_texts)}')
else:
    print('Corpus file not found or classifier missing. Using empty corpus.')
    corp_texts, corp_labels, corp_langs = [], [], []

## 4. Data Preprocessing and Combination
Combine lexicon and corpus data, deduplicate, and inspect distributions.

In [None]:
# Combine datasets
if 'clf' in globals() and hasattr(clf, 'combine_datasets') and len(lex_texts) > 0:
    combo_texts, combo_labels, combo_langs = clf.combine_datasets((lex_texts, lex_labels, lex_langs), (corp_texts, corp_labels, corp_langs))
else:
    combo_texts, combo_labels, combo_langs = lex_texts, lex_labels, lex_langs

print(f'Total combined samples: {len(combo_texts)}')
from collections import Counter
print('Label distribution:', Counter(combo_labels))
print('Top languages:', Counter(combo_langs).most_common(10))

## 5. Model Training
Fine-tune AfroXLMR on the combined multilingual dataset. The `train()` method in the class will handle training and logging.
Note: training a transformer requires significant compute and time; the example below demonstrates the call and will run for a reduced number of epochs for demo purposes.

In [None]:
# Run training (demo). Increase num_epochs and batch_size for real runs.
try:
    if len(combo_texts) < 2:
        raise ValueError('Not enough data to train; provide more samples or load a corpus.')
    train_res = clf.train(combo_texts, combo_labels, num_epochs=1, batch_size=8, languages=combo_langs, output_dir=OUTPUT_DIR)
    print('Training returned:', train_res)
    # Save model+tokenizer for later use
    try:
        clf.model.save_pretrained(OUTPUT_DIR)
        clf.tokenizer.save_pretrained(OUTPUT_DIR)
        print('Saved model and tokenizer to', OUTPUT_DIR)
    except Exception as e:
        print('Could not save model locally:', e)
except Exception as e:
    print('Training skipped or failed (demo environment):', e)

## 6. Model Evaluation (overall and per-language)
Use the classifier evaluation helpers to compute confusion matrices, classification reports, and per-language metrics.

In [None]:
# Prepare a test split (demo). For a real experiment, use a held-out corpus test set.
from sklearn.model_selection import train_test_split
if len(combo_texts) >= 4:
    train_texts, test_texts, train_labels, test_labels, train_langs, test_langs = train_test_split(combo_texts, combo_labels, combo_langs, test_size=0.2, random_state=42, stratify=combo_labels if len(set(combo_labels))>1 else None)
else:
    test_texts, test_labels, test_langs = combo_texts, combo_labels, combo_langs

# Detailed evaluation (confusion matrix + report)
try:
    eval_res = clf.evaluate_detailed(test_texts, test_labels, save_confusion_matrix=True)
    print('Evaluation metrics:', {k: eval_res[k] for k in ['accuracy','precision','recall','f1']})
except Exception as e:
    print('Detailed evaluation failed:', e)

# Per-language evaluation
try:
    per_lang = clf.evaluate_per_language(test_texts, test_labels, test_langs)
    print('Per-language results keys:', list(per_lang.keys()))
except Exception as e:
    print('Per-language evaluation failed:', e)

## 7. XAI Analysis and Visualization
Generate attention visualizations for sample texts across different languages and extract top important words per prediction.

In [None]:
# Sample XAI demonstrations (saves images with dpi=300)
xai_samples = [
    ('Ngiyajabula namhlanje','zulu'),
    ('Ndiri kufara nekuti bhuku iri rakanaka','shona'),
    ('Ke a nyorilwe','sepedi'),
    ('Dit is baie sleg','afrikaans')
]
for i, (text, lang) in enumerate(xai_samples):
    try:
        info = clf.analyze_important_words(text, top_k=3)
        print(f'XAI sample {i+1} ({lang}) predicted: {info[sentiment]} - top words:', info['top_words'])
        img_path = f'afroxlmr_attention_{lang}_{i+1}.png'
        try:
            clf.visualize_attention(text, layer=-1, head=0, save_path=img_path)
            print('Saved attention visualization to', img_path)
        except Exception as e:
            print('Could not render attention heatmap for sample:', e)
    except Exception as e:
        print('XAI step failed for sample:', e)

## 8. Aspect-Based Sentiment Analysis (ABSA) Demo
A small demonstration that extracts sentiments for specific aspects/keywords within a text using sentence splitting and focused predictions.

In [None]:
def analyze_aspects(text: str, aspects: List[str]):
    """Return sentiment for individual aspect mentions in the text.

    This simple demo looks for aspect keywords and predicts sentiment on the full text (you can refine to span-level).
    """
    found = []
    for aspect in aspects:
        if aspect.lower() in text.lower():
            pred = clf.predict([text])[0]
            found.append({'aspect': aspect, 'sentiment': pred['sentiment'], 'confidence': pred['confidence']})
    return found

# Demo aspects and texts
demo_texts = [
    ('Ngiyathokoza ngomsebenzi wezikole, kodwa izindawo zokuhlala zimele ukuthuthukiswa', ['work','housing']),
    ('Bhuku rakanaka asi mutengo wakaipisisa', ['book','price'])
]
for txt, aspects in demo_texts:
    try:
        res = analyze_aspects(txt, aspects)
        print('Text:', txt)
        print('Aspect analysis:', res)
    except Exception as e:
        print('ABSA demo failed:', e)

## 9. Ensemble Output Preparation
Extract probabilities and logits in a format ready to be combined with AfriBERTa or other model outputs.

In [None]:
# Example: prepare ensemble-ready CSV for a small set of texts
ensemble_texts = ['Ek is baie bly vandag', 'Le mosebetsi ke mobe']
try:
    probs = clf.predict_with_probabilities(ensemble_texts)
    logits = clf.get_logits(ensemble_texts)
    import numpy as np
    df_ens = pd.DataFrame(probs, columns=[clf.id2label[i] for i in range(probs.shape[1])])
    df_ens['text'] = ensemble_texts
    df_ens['logit_0'] = logits[:,0]
    df_ens['logit_1'] = logits[:,1]
    df_ens['logit_2'] = logits[:,2]
    ensemble_out_path = os.path.join(OUTPUT_DIR, 'afroxlmr_ensemble_ready.csv')
    df_ens.to_csv(ensemble_out_path, index=False)
    print('Saved ensemble-ready outputs to', ensemble_out_path)
except Exception as e:
    print('Ensemble preparation skipped/failed:', e)

## 10. Results Summary
Summarize training and evaluation results, and point to saved artifacts (model, confusion matrix images, attention visualizations, and ensemble CSV).

In [None]:
print('=== RESULTS SUMMARY ===')
print('Model directory:', OUTPUT_DIR)
print('Saved confusion matrix: afroxlmr_confusion_matrix.png (if evaluation ran)')
print('Saved attention images: afroxlmr_attention_*.png (if XAI ran)')
print('Saved ensemble CSV:', os.path.join(OUTPUT_DIR, 'afroxlmr_ensemble_ready.csv'))
print('Demo complete. For production runs increase data size, epochs, and batch size; run on a GPU-enabled environment.')