In [1]:
import numpy as np 
import pandas as pd 
import os
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from tqdm.notebook import tqdm  
import matplotlib.pyplot as plt
from sklearn.metrics import (
    confusion_matrix,
    ConfusionMatrixDisplay,
    roc_curve,
    auc,
    RocCurveDisplay,
    PrecisionRecallDisplay,
)
from sklearn.preprocessing import label_binarize
import seaborn as sns
from sklearn.metrics import classification_report
import re
import unicodedata
import matplotlib.pyplot as plt
import seaborn as sns
from nltk.sentiment.vader import SentimentIntensityAnalyzer
import nltk


# Download VADER lexicon
nltk.download('vader_lexicon')


data_dir = r"C:\Users\User\Desktop\Assignment 3 Resources"
for dirname, _, filenames in os.walk(data_dir):
    for filename in filenames:
        print(os.path.join(dirname, filename))

file_path = os.path.join(data_dir, 'corrected_lexicon.xlsx')
print('Using file_path:', file_path)


ModuleNotFoundError: No module named 'nltk'

In [None]:
# Local path for corrected lexicon
file_path = r"C:\Users\User\Desktop\Assignment 3 Resources\corrected_lexicon.xlsx"

# Load the dataset (guard against missing file)
if os.path.exists(file_path):
    df1 = pd.read_excel(file_path)
    print(f"Loaded {file_path} with shape {df1.shape}")
else:
    raise FileNotFoundError(f"Expected lexicon file not found at: {file_path}")

df1.head()


Loaded C:\Users\User\Desktop\Assignment 3 Resources\corrected_lexicon.xlsx with shape (3234, 9)


Unnamed: 0,CILUBA,FRANCAIS,ENGLISH,AFRIKAANS,ZULU,Sepedi,SCORE,SENTIMENT,NATURE
0,umue,un,a,N,I,,0,Neutre,nombre
1,Biabunyi,beaucoup,a lot,baie,okuningi,ga ntši,3,Positif,mot
2,bungi,beaucoup,a lot,baie,okuningi,kudu,0,Neutre,adverbe
3,dilekela,abandon,abandonment,Verlating,Ukulahlwa,hlokomologa,4,Positif,mot
4,Kulekela,abandon,abandonment,verlating,ukulahlwa,hlokomologa,3,Positif,mot


In [None]:
# Rename columns
df1.columns = ['ciluba', 'french', 'english', 'afrikaans', 'zulu', 'sepedi', 'score', 'sentiment', 'nature']

# Drop duplicate rows based on language columns + sentiment (or all columns if you prefer)
df = df1.drop_duplicates(subset=['ciluba', 'french', 'english', 'afrikaans', 'zulu', 'sepedi', 'sentiment'])

# Drop rows with missing values in the language columns + sentiment
df = df1.dropna(subset=['ciluba', 'french', 'english', 'afrikaans', 'zulu', 'sepedi', 'sentiment'])

# Now you can proceed to vectorize, reduce and plot based on these language columns as separate features or however you want.
# For example, you could vectorize each language column separately or combine them as needed.
df



Unnamed: 0,ciluba,french,english,afrikaans,zulu,sepedi,score,sentiment,nature
1,Biabunyi,beaucoup,a lot,baie,okuningi,ga ntši,3,Positif,mot
2,bungi,beaucoup,a lot,baie,okuningi,kudu,0,Neutre,adverbe
3,dilekela,abandon,abandonment,Verlating,Ukulahlwa,hlokomologa,4,Positif,mot
4,Kulekela,abandon,abandonment,verlating,ukulahlwa,hlokomologa,3,Positif,mot
5,kulekela,abandon,abandonment,Verlating,Ukulahlwa,hlokomologa,4,Positif,mot
...,...,...,...,...,...,...,...,...,...
3200,Ya Bunda,construire,build,Bou,Yakha,aga,5,Positive,verbe
3201,Ya ku leta,faire venir,bring,Bring,Letha,tliša,4,Positive,verbe
3202,Yamba,prendre,take,Neem,Thatha,tšeya,3,Positive,verbe
3205,Yeleka,espérer,hope,Hoop,Themba,tshepo,9,Positive,verbe


In [None]:

# ================================
# STEP 1: Load Data
# ================================
# Local files in data_dir set earlier in cell 1 
lexicon_path = os.path.join(r"C:\Users\User\Desktop\Assignment 3 Resources", 'corrected_lexicon.xlsx')
french_test_path = os.path.join(r"C:\Users\User\Desktop\Assignment 3 Resources", 'french_test_corpus.xlsx')

df = pd.read_excel(lexicon_path)
test_corpus_df = pd.read_excel(french_test_path)

# ================================
# STEP 2: Preprocess Lexicon
# ================================
# Rename columns
df.columns = ['ciluba', 'french', 'english', 'afrikaans', 'zulu', 'sepedi', 'score', 'sentiment', 'nature']

# Define languages
supported_languages = ['french', 'afrikaans', 'zulu', 'ciluba', 'sepedi', 'english']

# ================================
# STEP 3: Translation Dictionary
# ================================
def clean_text_for_matching(text):
    text = re.sub(r'[^\w\s]', '', str(text))
    return text.lower()

def create_translation_dicts(df, languages):
    translation_dicts = {src: {tgt: {} for tgt in languages if tgt != src} for src in languages}
    for _, row in df.iterrows():
        for src in languages:
            source_phrase = str(row[src]).strip().lower()
            if pd.isna(source_phrase) or source_phrase == '':
                continue
            for tgt in languages:
                if tgt == src:
                    continue
                target_phrase = str(row[tgt]).strip().lower()
                if pd.isna(target_phrase) or target_phrase == '':
                    continue
                translation_dicts[src][tgt][source_phrase] = target_phrase
    return translation_dicts

translation_dicts = create_translation_dicts(df, supported_languages)
print("✅ Translation dictionaries created.")

# ================================
# STEP 4: Sentiment Scores
# ================================
sentiment_averages = {}
all_sentiments = {}

for lang in supported_languages:
    sentiment_averages[lang] = df.groupby(lang)['score'].mean().to_dict()
    all_sentiments[lang] = df.groupby(lang)['score'].apply(list).to_dict()

print("✅ Sentiment scores calculated.")

# ================================
# STEP 5: Custom Sentiment Function
# ================================
def compute_sentiment_v2(scores):
    if len(scores) == 1:
        return scores[0]
    elif len(scores) == 2:
        return max(scores, key=abs)
    else:
        pos = [s for s in scores if s > 0]
        neg = [s for s in scores if s < 0]
        if len(pos) >= len(neg):
            return sum(pos) / len(pos) if pos else 0
        else:
            return sum(neg) / len(neg) if neg else 0

# ================================
# STEP 6: Translate & Analyze Function
# ================================
def translate_analyze_sentiments_with_vader(text, source_lang, target_lang,
                                            translation_dicts, sentiment_averages, all_sentiments,
                                            vader_analyzer):
    source_lang = str(source_lang).lower()
    target_lang = str(target_lang).lower()

    if source_lang not in translation_dicts or target_lang not in translation_dicts[source_lang]:
        return {k: '' if isinstance(v, str) else 0 for k, v in {
            "translated_text": "",
            "total_score_avg": 0,
            "word_scores_avg": "",
            "sentiment_avg": "neutral",
            "total_score_v2": 0,
            "word_scores_v2": "",
            "sentiment_v2": "neutral",
            "vader_positive": 0,
            "vader_negative": 0,
            "vader_neutral": 0,
            "vader_compound": 0,
            "vader_sentiment": "neutral"
        }.items()}

    cleaned_text = clean_text_for_matching(text)
    words = cleaned_text.split()
    translated_sentence = []
    total_score_avg = 0
    total_score_v2 = 0
    word_scores_avg = []
    word_scores_v2 = []

    i = 0
    while i < len(words):
        matched_phrase = None
        translated_phrase = None
        phrase_score_avg = 0
        phrase_scores_v2 = 0
        max_length = min(5, len(words) - i)

        for j in range(max_length, 0, -1):
            phrase = ' '.join(words[i:i+j])
            if phrase in translation_dicts[source_lang][target_lang]:
                matched_phrase = phrase
                translated_phrase = translation_dicts[source_lang][target_lang][phrase]
                phrase_score_avg = sentiment_averages[source_lang].get(phrase, 0)
                phrase_scores = all_sentiments[source_lang].get(phrase, [])
                phrase_scores_v2 = compute_sentiment_v2(phrase_scores)
                i += j
                break

        if matched_phrase:
            translated_sentence.append(translated_phrase)
            total_score_avg += phrase_score_avg
            total_score_v2 += phrase_scores_v2
            word_scores_avg.append(f"{matched_phrase}:{phrase_score_avg}")
            word_scores_v2.append(f"{matched_phrase}:{phrase_scores_v2}")
        else:
            word = words[i]
            translated_word = translation_dicts[source_lang][target_lang].get(word, word)
            translated_sentence.append(translated_word)
            score_avg = sentiment_averages[source_lang].get(word, 0)
            scores = all_sentiments[source_lang].get(word, [])
            score_v2 = compute_sentiment_v2(scores)
            total_score_avg += score_avg
            total_score_v2 += score_v2
            word_scores_avg.append(f"{word}:{score_avg}")
            word_scores_v2.append(f"{word}:{score_v2}")
            i += 1

    translated_text = ' '.join(translated_sentence).strip()
    sentiment_avg = "positive" if total_score_avg > 0.05 else "negative" if total_score_avg < -0.05 else "neutral"
    sentiment_v2 = "positive" if total_score_v2 > 0.05 else "negative" if total_score_v2 < -0.05 else "neutral"

    vader_scores = vader_analyzer.polarity_scores(text)
    vader_sentiment = "positive" if vader_scores['compound'] >= 0.05 else "negative" if vader_scores['compound'] <= -0.05 else "neutral"

    return {
        "translated_text": translated_text,
        "total_score_avg": total_score_avg,
        "word_scores_avg": '; '.join(word_scores_avg),
        "sentiment_avg": sentiment_avg,
        "total_score_v2": total_score_v2,
        "word_scores_v2": '; '.join(word_scores_v2),
        "sentiment_v2": sentiment_v2,
        "vader_positive": vader_scores['pos'],
        "vader_negative": vader_scores['neg'],
        "vader_neutral": vader_scores['neu'],
        "vader_compound": vader_scores['compound'],
        "vader_sentiment": vader_sentiment
    }

# ================================
# STEP 7: Apply to Test Corpus
# ================================
def safe_translate_and_analyze_sentiments_with_vader(row, translation_dicts, sentiment_averages, all_sentiments, vader_analyzer):
    try:
        return pd.Series(translate_analyze_sentiments_with_vader(
            row.get('sentence', ''),
            row.get('source_language', ''),
            row.get('target_language', ''),
            translation_dicts,
            sentiment_averages,
            all_sentiments,
            vader_analyzer
        ))
    except Exception as e:
        print(f"Error in row {row.name}: {e}")
        return pd.Series({
            "translated_text": "",
            "total_score_avg": 0,
            "word_scores_avg": "",
            "sentiment_avg": "neutral",
            "total_score_v2": 0,
            "word_scores_v2": "",
            "sentiment_v2": "neutral",
            "vader_positive": 0,
            "vader_negative": 0,
            "vader_neutral": 0,
            "vader_compound": 0,
            "vader_sentiment": "neutral"
        })

# Run the sentiment analysis
vader_analyzer = SentimentIntensityAnalyzer()
# Ensure we have the expected columns in test_corpus_df
print('Test corpus columns:', test_corpus_df.columns.tolist())
if 'sentence' not in test_corpus_df.columns:
    raise KeyError("Expected 'sentence' column in test corpus. Found: " + ','.join(test_corpus_df.columns.astype(str)))

# Apply translation+sentiment
test_corpus_df = test_corpus_df.copy()

# For speed in this environment, process a sample (or full if small)
sample_size = min(len(test_corpus_df), 500)
print(f"Processing {sample_size} rows (of {len(test_corpus_df)})")

test_corpus_df_sample = test_corpus_df.iloc[:sample_size].copy()

results = test_corpus_df_sample.apply(
    lambda row: safe_translate_and_analyze_sentiments_with_vader(
        row,
        translation_dicts,
        sentiment_averages,
        all_sentiments,
        vader_analyzer
    ), axis=1
)

# Join results back
test_corpus_df = test_corpus_df_sample.join(results)

print("✅ Sentiment analysis applied to test corpus (sample).")

# ================================
# STEP 8: Optional Evaluation
# ================================
# Map sentiment strings to numeric
sentiment_mapping = {'negative': -1, 'neutral': 0, 'positive': 1}
if 'sentiment_v2' in test_corpus_df.columns:
    test_corpus_df['custom_sentiment_numeric'] = test_corpus_df['sentiment_v2'].map(sentiment_mapping)
if 'vader_sentiment' in test_corpus_df.columns:
    test_corpus_df['vader_sentiment_numeric'] = test_corpus_df['vader_sentiment'].map(sentiment_mapping)

# ================================
# STEP 9: Display Results
# ================================
print("📊 Preview of sentiment analysis results:")
display(test_corpus_df.head(20))


✅ Translation dictionaries created.
✅ Sentiment scores calculated.
Test corpus columns: ['source_language', 'target_language', 'sentence']
Processing 500 rows (of 2999)
✅ Sentiment analysis applied to test corpus (sample).
📊 Preview of sentiment analysis results:


Unnamed: 0,source_language,target_language,sentence,translated_text,total_score_avg,word_scores_avg,sentiment_avg,total_score_v2,word_scores_v2,sentiment_v2,vader_positive,vader_negative,vader_neutral,vader_compound,vader_sentiment,custom_sentiment_numeric,vader_sentiment_numeric
0,french,english,Arrange pagne proteger Comportement Seulement,arrange loincloth protect behavior only,10.6,arrange:1.0; pagne:2.0; proteger:3.0; comporte...,positive,11.6,arrange:1; pagne:3; proteger:3; comportement:1...,positive,0.0,0.0,1.0,0.0,neutral,1,0
1,french,ciluba,Rearrange mordre purifier Vérité bourse,akajilula kusuma kutokesha bulelela tshibombu,8.566667,rearrange:1.0; mordre:-2.0; purifier:3.6666666...,positive,9.066667,rearrange:1; mordre:-2; purifier:3.66666666666...,positive,0.0,0.0,1.0,0.0,neutral,1,0
2,french,afrikaans,Parle aisé Serpent Mère Abhorrer,praat maklik slang moeder verafsku,9.0,parle:2.0; aisé:3.0; serpent:-2.25; mère:2.25;...,positive,7.25,parle:2; aisé:3; serpent:-4.0; mère:2.25; abho...,positive,0.0,0.0,1.0,0.0,neutral,1,0
3,french,zulu,Parler à nouveau murmurer chanter Sein castrer,khuluma futhi nyenyeza cula isibele xholosa,16.642857,parler à nouveau:2.0; murmurer:4.0; chanter:3....,positive,16.642857,parler à nouveau:2; murmurer:4; chanter:3.1428...,positive,0.0,0.0,1.0,0.0,neutral,1,0
4,french,english,Remet Déchirure Infidèle Étrangler Kubela,put back tear unfaithful strangle kubela,-1.8,remet:3.2; déchirure:-4.0; infidèle:-5.0; étra...,negative,-1.8,remet:3.2; déchirure:-4; infidèle:-5; étrangle...,negative,0.0,0.0,1.0,0.0,neutral,-1,0
5,french,ciluba,Dis Fétiche finir Rêver corps,amba manga tshinda kulota mubidimbidi,6.714286,dis:3.0; fétiche:-3.0; finir:1.333333333333333...,positive,9.380952,dis:3; fétiche:-3.0; finir:4.0; rêver:2.666666...,positive,0.0,0.0,1.0,0.0,neutral,1,0
6,french,afrikaans,Superposer Preparer bruit cérémoniecoutumière ...,superponeer voorberei geraas gebruiklike serem...,13.266667,superposer:2.2; preparer:3.4; bruit:2.66666666...,positive,13.266667,superposer:2.2; preparer:3.4; bruit:2.66666666...,positive,0.0,0.0,1.0,0.0,neutral,1,0
7,french,zulu,Ramasse parfaite voler cueillire trente-cinq,phakamisa gweda ndiza ukhethiwe trentecinq,3.8,ramasse:4.0; parfaite:3.0; voler:-4.2; cueilli...,positive,4.8,ramasse:4; parfaite:4; voler:-4.2; cueillire:1...,positive,0.0,0.0,1.0,0.0,neutral,1,0
8,french,english,Dépêche maladie chapeau rouler repasser,dispatch illness hat to roll go back,9.55,dépêche:4.0; maladie:-0.7; chapeau:2.75; roule...,positive,8.25,dépêche:4; maladie:-2.5; chapeau:2.75; rouler:...,positive,0.0,0.0,1.0,0.0,neutral,1,0
9,french,ciluba,Répète Galère Expliquer trasformer voler,ambulula dikenga kuvuija kukudimuna kuiba,0.183333,répète:2.8; galère:-3.75; expliquer:4.33333333...,positive,0.183333,répète:2.8; galère:-3.75; expliquer:4.33333333...,positive,0.0,0.0,1.0,0.0,neutral,1,0


In [None]:
# Save the processed test corpus (current `test_corpus_df` in notebook)
import os
out_path = os.path.join(data_dir, 'test_corpus_processed.csv')
# Ensure DataFrame exists
if 'test_corpus_df' in globals():
    try:
        test_corpus_df.to_csv(out_path, index=False, encoding='utf-8')
        print(f"Saved processed test_corpus_df to: {out_path}")
    except Exception as e:
        print('Error saving CSV:', e)
else:
    print('test_corpus_df not found in the notebook namespace.')


Saved processed test_corpus_df to: C:\Users\User\Desktop\Assignment 3 Resources\test_corpus_processed.csv


In [None]:
# ================================
# STEP 1: Install Required Libraries (SIMPLIFIED)
# ================================
# Run this cell AFTER the NumPy fix cell above and kernel restart

import subprocess
import sys

print("📦 Installing transformer packages...")
print("=" * 60)

# Install packages one by one with proper error handling
packages = [
    ('transformers', 'transformers'),
    ('torch', 'torch --index-url https://download.pytorch.org/whl/cpu'),
    ('sentencepiece', 'sentencepiece'),
]

for name, install_cmd in packages:
    print(f"\n📥 Installing {name}...")
    try:
        result = subprocess.run(
            [sys.executable, '-m', 'pip', 'install'] + install_cmd.split(),
            capture_output=True,
            text=True,
            timeout=300
        )
        if result.returncode == 0:
            print(f"   ✅ {name} installed")
        else:
            print(f"   ⚠️ {name} may have issues: {result.stderr[:200]}")
    except Exception as e:
        print(f"   ❌ Error with {name}: {str(e)[:200]}")

print("\n" + "=" * 60)
print("✅ Installation complete!")
print("\n⚠️ IMPORTANT: Restart the kernel now!")
print("   Go to: Kernel → Restart Kernel")

📦 Installing transformer packages...

📥 Installing transformers...
   ✅ transformers installed

📥 Installing torch...
   ✅ torch installed

📥 Installing sentencepiece...
   ✅ sentencepiece installed

✅ Installation complete!

⚠️ IMPORTANT: Restart the kernel now!
   Go to: Kernel → Restart Kernel


In [None]:
# ================================
# STEP 1b: Verify Installation (Run this after kernel restart)
# ================================
# Run this cell to check if packages are properly installed

import sys

def check_package(package_name):
    try:
        __import__(package_name)
        return True
    except ImportError:
        return False

packages_to_check = {
    'torch': 'PyTorch',
    'transformers': 'Transformers',
    'sentencepiece': 'SentencePiece',
}

print("Checking installed packages...")
print("=" * 50)
all_installed = True

for pkg, name in packages_to_check.items():
    if check_package(pkg):
        print(f"✅ {name} ({pkg}): Installed")
    else:
        print(f"❌ {name} ({pkg}): NOT installed")
        all_installed = False

print("=" * 50)
if all_installed:
    print("\n✅ All packages are installed! You can proceed.")
else:
    print("\n❌ Some packages are missing. Please:")
    print("   1. Run the installation cell above")
    print("   2. Restart the kernel")
    print("   3. Run this cell again")

Checking installed packages...
✅ PyTorch (torch): Installed


  from .autonotebook import tqdm as notebook_tqdm


✅ Transformers (transformers): Installed
✅ SentencePiece (sentencepiece): Installed

✅ All packages are installed! You can proceed.


In [None]:
# ================================
# EVERYTHING BELOW IS WITH REGARDS TO MODEL TRAINING 
# ================================

In [None]:
# ================================
# STEP 2: Load Transformer Models and Prepare Data
# ================================
# Run this cell AFTER restarting the kernel following package installation

import pandas as pd
import numpy as np
import torch
from transformers import AutoTokenizer, AutoModelForSequenceClassification

# Check if GPU is available
device = "cuda" if torch.cuda.is_available() else "cpu"

# Model names from HuggingFace
afroxlmr_model = "Davlan/afro-xlmr-base"  # AfroXLMR base model
afriberta_model = "castorini/afriberta_base"  # AfriBERTa base model

In [None]:
#CELLS BELOW IS CREATING THREE TYPES OF DATA:
#STORING LEXICON INDIVIDUAL WORDS
#CREATING SENTENCES FROM LESXICON WORDS
#USING CORPUS SENTENCES BOTH TRANSLATED AND ORIGINAL

In [None]:
# ================================
# THIS CELL IS PURELY FOR LEXICON INDIVIDUAL WORDS - WE DONT HAVE TO INCLUDE JUST IN CASE
# ================================
# We'll use the lexicon data to create labeled training examples
# The lexicon has sentiment labels we can use for fine-tuning


# Problem: Lexicon has mixed French/English labels ("Positif", "positive", "Negatif", etc.)
# Solution: Convert all variants to consistent English lowercase labels
def normalize_sentiment(sentiment):
    sentiment = str(sentiment).lower().strip()  # Convert to lowercase and remove spaces
    
    # Map all variants to standard labels
    if sentiment in ['positif', 'positive']:
        return 'positive'
    elif sentiment in ['negatif', 'negative']:
        return 'negative'
    elif sentiment in ['neutre', 'neutral']:
        return 'neutral'
    else:
        return None  # Invalid labels will be removed later

# Apply normalization to all sentiment labels
df['sentiment_normalized'] = df['sentiment'].apply(normalize_sentiment)

# Remove rows with unmapped/invalid sentiments
df_clean = df[df['sentiment_normalized'].notna()].copy()


# --------------------------------------------
# 2. EXTRACT WORDS FROM EACH LANGUAGE
# --------------------------------------------
# For each language column (French, English, Zulu, etc.), extract the word + sentiment
# This creates separate training examples for each language's words
# Example: "beaucoup" (French) → positive, "a lot" (English) → positive

train_data = []  # Will hold data from all languages

for lang in supported_languages:  # Loop through: french, afrikaans, zulu, ciluba, sepedi, english
    # Extract the language column and sentiment
    temp_df = df_clean[[lang, 'sentiment_normalized']].copy()
    temp_df.columns = ['text', 'sentiment']
    
    # Clean up the data
    temp_df = temp_df.dropna(subset=['text', 'sentiment'])  # Remove empty cells
    temp_df['text'] = temp_df['text'].astype(str)           # Ensure text is string
    temp_df = temp_df[temp_df['text'].str.strip() != '']    # Remove blank strings
    temp_df = temp_df[temp_df['text'].str.lower() != 'nan'] # Remove "nan" strings
    
    # Add this language's data to the list
    train_data.append(temp_df)

# --------------------------------------------
# 3. COMBINE ALL LANGUAGES
# --------------------------------------------
# Merge all language data into one big training set
# Result: bungi, beaucoup, a lot, baie, okuningi, kudu all become separate training examples
train_df = pd.concat(train_data, ignore_index=True)
# Remove duplicate words (if same word appears in multiple rows)
train_df = train_df.drop_duplicates(subset=['text'])

# --------------------------------------------
# 4. CONVERT SENTIMENT TO NUMBERS
# --------------------------------------------
# Transformers need numeric labels, not text
# negative → 0, neutral → 1, positive → 2
sentiment_to_label = {'negative': 0, 'neutral': 1, 'positive': 2}  # Text → Number
label_to_sentiment = {0: 'negative', 1: 'neutral', 2: 'positive'}  # Number → Text (for later)

train_df['label'] = train_df['sentiment'].map(sentiment_to_label)

# Remove any rows where mapping failed (shouldn't happen, but just in case)
train_df = train_df.dropna(subset=['label'])
train_df['label'] = train_df['label'].astype(int)  # Ensure label is integer

# --------------------------------------------
# 5. DISPLAY RESULTS
# --------------------------------------------
print(f"\n✅ Training data prepared:")
print(f"   Total examples: {len(train_df)}")
print(f"   Label distribution:\n{train_df['label'].value_counts()}")
print(f"\n   Sample examples:")
display(train_df.head(10))


✅ Training data prepared:
   Total examples: 10454
   Label distribution:
label
2    8599
0     980
1     875
Name: count, dtype: int64

   Sample examples:


Unnamed: 0,text,sentiment,label
0,un,neutral,1
1,beaucoup,positive,2
3,abandon,positive,2
6,abhorrer,positive,2
7,capacité,positive,2
8,abolir,positive,2
9,abolition,positive,2
10,abominable,positive,2
11,avorter,negative,0
12,absence,positive,2


In [None]:
# ================================
# CREATING SENTENCES USING LEXICON WORDS FOR MORE DATA - MAYBE LEXICON TEAM SHOULD DO THIS BUT JUST IN CASE ITS HERE
# ================================
# Convert isolated lexicon words into sentence contexts

print("🔄 Augmenting lexicon words into sentence templates...")

# Sentence templates for different languages
templates_by_lang = {
    'french': [
        "Je trouve que {} est important.",
        "C'est {}.",
        "Le mot {} exprime un sentiment.",
        "{} dans cette phrase."
    ],
    'english': [
        "I think {} is important.",
        "This is {}.",
        "The word {} expresses a feeling.",
        "{} in this sentence."
    ],
    'afrikaans': [
        "Dit is {}.",
        "Die woord {} is belangrik.",
        "{} in hierdie sin."
    ],
    'zulu': [
        "Lokhu {}.",
        "Igama {} libalulekile.",
        "{} kulesi sigaba."
    ],
    'ciluba': [
        "Ici {}.",
        "Ijambu {} lidipingana.",
        "{} mumpanzu."
    ],
    'sepedi': [
        "Se ke {}.",
        "Lentšu {} le bohlokwa.",
        "{} mo polelong ye."
    ]
}

# Create augmented dataset
aug_data = []

for _, row in df_clean.iterrows():
    sentiment = row['sentiment_normalized']
    if pd.isna(sentiment):
        continue
    
    # For each language, create template-based sentences
    for lang in supported_languages:
        word = str(row[lang]).strip()
        if not word or pd.isna(word) or word.lower() == 'nan':
            continue
        
        # Use language-specific templates if available, else use French templates
        templates = templates_by_lang.get(lang, templates_by_lang['french'])
        
        # Create 2-3 sentences per word (not all templates to avoid too much data)
        for template in templates[:2]:
            try:
                sentence = template.format(word)
                aug_data.append({
                    'text': sentence,
                    'sentiment': sentiment,
                    'source': 'lexicon_augmented',
                    'language': lang
                })
            except:
                continue

aug_df = pd.DataFrame(aug_data)
print(f"✅ Created {len(aug_df)} augmented sentences from lexicon")
print(f"   Distribution: {aug_df['sentiment'].value_counts().to_dict()}")

🔄 Augmenting lexicon words into sentence templates...
✅ Created 38008 augmented sentences from lexicon
   Distribution: {'positive': 32128, 'negative': 3440, 'neutral': 2440}


In [None]:
# ================================
# LOADING CORPUS DATA AND SPLITTING TRAIN/TEST WE ALSO INCLUDE TRANSLATED SENTENCES
# ================================

# Load the processed corpus
corpus_path = os.path.join(data_dir, 'test_corpus_processed.csv')
if os.path.exists(corpus_path):
    from sklearn.model_selection import train_test_split
    
    corpus_df = pd.read_csv(corpus_path)
    print(f"✅ Loaded corpus with {len(corpus_df)} sentences")
    
    # Extract French sentences with sentiment_v2 labels
    corpus_data = corpus_df[['sentence', 'sentiment_v2']].copy()
    corpus_data.columns = ['text', 'sentiment']
    corpus_data = corpus_data.dropna(subset=['text', 'sentiment'])
    
    # SPLIT: 70% train, 30% test (stratified by sentiment)
    corpus_train, corpus_test, train_indices, test_indices = train_test_split(
        corpus_data,
        corpus_data.index,  # Keep track of indices for translation lookup
        test_size=0.3,
        random_state=42,
        stratify=corpus_data['sentiment']
    )
    
    corpus_train['source'] = 'corpus'
    corpus_train['language'] = 'french'
    
    print(f"✅ Corpus split completed:")
    print(f"   Training: {len(corpus_train)} French sentences")
    print(f"   Testing: {len(corpus_test)} French sentences (held-out for evaluation)")
    
    # ========================================
    # ADD TRANSLATED SENTENCES
    # ========================================
    
    # Check if translated_text and target_language columns exist
    if 'translated_text' in corpus_df.columns and 'target_language' in corpus_df.columns:
        translated_train_data = []
        
        # For each training sample, get its translated version
        for idx in train_indices:
            row = corpus_df.loc[idx]
            
            # Check if translation exists and is not empty
            translated_text = str(row.get('translated_text', '')).strip()
            target_lang = str(row.get('target_language', '')).strip().lower()
            sentiment = row.get('sentiment_v2', '')
            
            if translated_text and translated_text != 'nan' and len(translated_text) > 0:
                # Only add if target language is different from French
                if target_lang and target_lang != 'french':
                    translated_train_data.append({
                        'text': translated_text,
                        'sentiment': sentiment,
                        'source': 'corpus_translated',
                        'language': target_lang
                    })
        
        # Add translated sentences to training data
        if translated_train_data:
            corpus_train_translated = pd.DataFrame(translated_train_data)
            corpus_train = pd.concat([corpus_train, corpus_train_translated], ignore_index=True)
            
            print(f"   ✅ Added {len(translated_train_data)} translated sentences")
            print(f"   Languages distribution:")
            lang_counts = corpus_train_translated['language'].value_counts()
            for lang, count in lang_counts.items():
                print(f"      - {lang}: {count} sentences")
            print(f"\n   📊 Total training sentences: {len(corpus_train)}")
            print(f"      - French (original): {(corpus_train['source'] == 'corpus').sum()}")
            print(f"      - Translated: {(corpus_train['source'] == 'corpus_translated').sum()}")
        else:
            print(f"   ⚠️ No valid translations found in corpus")
    else:
        print(f"   ℹ️ No 'translated_text' or 'target_language' columns found")
        print(f"   Available columns: {corpus_df.columns.tolist()}")
        print(f"   Continuing with French sentences only")
    
    print(f"\n   Train sentiment distribution: {corpus_train['sentiment'].value_counts().to_dict()}")
    print(f"   Test sentiment distribution: {corpus_test['sentiment'].value_counts().to_dict()}")
    
    # Save test split for later evaluation
    test_split_path = os.path.join(data_dir, 'corpus_test_split.csv')
    corpus_test.to_csv(test_split_path, index=False)
    print(f"\n   💾 Saved held-out test set to: corpus_test_split.csv")
else:
    print(f"⚠️ Corpus file not found at {corpus_path}")
    print("   Will use only lexicon data for training")
    corpus_train = pd.DataFrame(columns=['text', 'sentiment', 'source', 'language'])
    corpus_test = pd.DataFrame(columns=['text', 'sentiment'])



✅ Loaded corpus with 500 sentences
✅ Corpus split completed:
   Training: 350 French sentences
   Testing: 150 French sentences (held-out for evaluation)
   ✅ Added 350 translated sentences
   Languages distribution:
      - afrikaans: 96 sentences
      - english: 87 sentences
      - zulu: 84 sentences
      - ciluba: 83 sentences

   📊 Total training sentences: 700
      - French (original): 350
      - Translated: 350

   Train sentiment distribution: {'positive': 664, 'negative': 34, 'neutral': 2}
   Test sentiment distribution: {'positive': 142, 'negative': 7, 'neutral': 1}

   💾 Saved held-out test set to: corpus_test_split.csv


In [None]:
#CELLS ABOVE IS CREATING THREE TYPES OF DATA:
#STORING LEXICON INDIVIDUAL WORDS
#CREATING SENTENCES FROM LESXICON WORDS
#USING CORPUS SENTENCES BOTH TRANSLATED AND ORIGINAL

In [None]:
# ================================
# COMBINE ALL TRAINING DATA SOURCES
# ================================
#TRAINING DATA - If we want to include lexicon singular words
lexicon_original = train_df[['text', 'sentiment']].copy() # Individual words from lexicon and sentiment values: triste - negative
lexicon_original['source'] = 'lexicon_original'  # Adds column called source to see where it came from: triste - negative - lexicon_original
lexicon_original['language'] = 'mixed' # Add columns called language to indicate mixed languages

# Takes data from sources and combines them into one big dataframe for training
combined_train_df = pd.concat([
    # lexicon_original, COMMENTED OUT: unless we want model to have context of specific words
    # aug_df[['text', 'sentiment', 'source', 'language']], COMMENTED OUT: ~7,000 CREATED sentences from lexicon (uncomment to use)
    corpus_train[['text', 'sentiment', 'source', 'language']] # Uses Corpus sentences with sentiment values
], ignore_index=True)

# Remove duplicates
combined_train_df = combined_train_df.drop_duplicates(subset=['text'])

# For each word/sentence it looks at the sentiment value and maps it to a number
combined_train_df['label'] = combined_train_df['sentiment'].map(sentiment_to_label)
combined_train_df = combined_train_df.dropna(subset=['label'])
combined_train_df['label'] = combined_train_df['label'].astype(int)


#JUST SOME LOGGING TO SEE THE DATA THE MODELS WILL USE
print(f"\n✅ Combined training data created:")
print(f"   Total examples: {len(combined_train_df)}")
print(f"   - Lexicon original: {(combined_train_df['source'] == 'lexicon_original').sum()}")
print(f"   - Lexicon augmented: {(combined_train_df['source'] == 'lexicon_augmented').sum()}")
print(f"   - Corpus (French): {(combined_train_df['source'] == 'corpus').sum()}")
print(f"   - Corpus (Translated): {(combined_train_df['source'] == 'corpus_translated').sum()}")
print(f"\n   Language distribution:")
if 'language' in combined_train_df.columns:
    print(combined_train_df['language'].value_counts().to_dict())
print(f"\n   Label distribution:")
print(combined_train_df['label'].value_counts().sort_index())
print(f"\n   Sample from each source:")
for src in combined_train_df['source'].unique():
    sample = combined_train_df[combined_train_df['source'] == src].head(2)
    print(f"\n   {src}:")
    for _, row in sample.iterrows():
        lang_info = f" [{row.get('language', 'unknown')}]" if 'language' in row else ""
        print(f"      {row['text'][:60]}...{lang_info} → {row['sentiment']}")

# Update train_df to use combined data
train_df = combined_train_df[['text', 'label']].copy()
print(f"\n✅ Ready to train with {len(train_df)} examples!")


✅ Combined training data created:
   Total examples: 700
   - Lexicon original: 0
   - Lexicon augmented: 0
   - Corpus (French): 350
   - Corpus (Translated): 350

   Language distribution:
{'french': 350, 'afrikaans': 96, 'english': 87, 'zulu': 84, 'ciluba': 83}

   Label distribution:
label
0     34
1      2
2    664
Name: count, dtype: int64

   Sample from each source:

   corpus:
      coude lit Dehors galère captif... [french] → positive
      Courir Larme Doigt combo Poing... [french] → positive

   corpus_translated:
      lukenyibu bulalu kuya dikenga mupika... [ciluba] → positive
      gijima izinyembezi umunwe isivalo inqindi... [zulu] → positive

✅ Ready to train with 700 examples!


In [None]:
#BELOW WE FINE TUNE & TRAIN AfroXLMR USING THE COMBINED TRAINING DATA

In [None]:
# ================================
# FINE TUNE AfroXLMR - Getting everything ready to train AfroXLMR model
# ================================

# ==================================================================================
#Import classes from hugging face transformers
from transformers import (
    AutoTokenizer, 
    AutoModelForSequenceClassification,
    TrainingArguments, 
    Trainer,
    DataCollatorWithPadding
)
#To split train and test data
from sklearn.model_selection import train_test_split
#Evaluation metrix to test model performance
from sklearn.metrics import accuracy_score, precision_recall_fscore_support
#Deep learning framework
import torch
from torch.utils.data import Dataset
# ==================================================================================

# ==================================================================================
# Create PyTorch Dataset it will accept texts ("Je suis triste", "C'est beaucoup") and labels (0, 1, 2)
class SentimentDataset(Dataset):
    def __init__(self, texts, labels, tokenizer, max_length=128):
        self.encodings = tokenizer(texts, truncation=True, padding=True, max_length=max_length)
        self.labels = labels
    
    def __getitem__(self, idx):
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        item['labels'] = torch.tensor(self.labels[idx])
        return item
    
    def __len__(self):
        return len(self.labels)
# ==================================================================================

# ==================================================================================
# Split data into train/validation
train_texts = train_df['text'].tolist()
train_labels = train_df['label'].tolist()

X_train, X_val, y_train, y_val = train_test_split(
    train_texts, train_labels, test_size=0.2, random_state=42, stratify=train_labels
)

print(f"Training samples: {len(X_train)}")
print(f"Validation samples: {len(X_val)}")
# ==================================================================================

# ==================================================================================
# Load AfroXLMR tokenizer and model
afroxlmr_tokenizer = AutoTokenizer.from_pretrained(afroxlmr_model)
afroxlmr_classifier = AutoModelForSequenceClassification.from_pretrained(
    afroxlmr_model,
    num_labels=3,  # negative, neutral, positive
    id2label=label_to_sentiment,
    label2id=sentiment_to_label
)
# ==================================================================================

# Create datasets
train_dataset = SentimentDataset(X_train, y_train, afroxlmr_tokenizer)
val_dataset = SentimentDataset(X_val, y_val, afroxlmr_tokenizer)


Training samples: 560
Validation samples: 140


Some weights of XLMRobertaForSequenceClassification were not initialized from the model checkpoint at Davlan/afro-xlmr-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
# ================================
# Train AfroXLMR Model
# ================================

# Define metrics computation
def compute_metrics(pred):
    labels = pred.label_ids
    preds = pred.predictions.argmax(-1)
    precision, recall, f1, _ = precision_recall_fscore_support(labels, preds, average='weighted')
    acc = accuracy_score(labels, preds)
    return {
        'accuracy': acc,
        'f1': f1,
        'precision': precision,
        'recall': recall
    }

# Training arguments
# Note: Using updated parameter names for newer transformers versions
training_args = TrainingArguments(
    output_dir='./results_afroxlmr',
    num_train_epochs=3,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    warmup_steps=100,
    weight_decay=0.01,
    logging_dir='./logs',
    logging_steps=50,
    eval_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
    metric_for_best_model="f1",
)

# Create Trainer
trainer_afroxlmr = Trainer(
    model=afroxlmr_classifier,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    compute_metrics=compute_metrics,
)

print("Starting AfroXLMR training...")

# Train the model
trainer_afroxlmr.train()

print("\nAfroXLMR training completed!")

# Evaluate
eval_results = trainer_afroxlmr.evaluate()
print(f"\nAfroXLMR Validation Results:")
for key, value in eval_results.items():
    print(f"   {key}: {value:.4f}")

Starting AfroXLMR training...




Epoch,Training Loss,Validation Loss,Accuracy,F1,Precision,Recall
1,No log,0.23718,0.95,0.925641,0.9025,0.95
2,0.592900,0.225907,0.95,0.925641,0.9025,0.95
3,0.228200,0.235247,0.95,0.925641,0.9025,0.95


  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])



AfroXLMR training completed!





AfroXLMR Validation Results:
   eval_loss: 0.2372
   eval_accuracy: 0.9500
   eval_f1: 0.9256
   eval_precision: 0.9025
   eval_recall: 0.9500
   eval_runtime: 2.0722
   eval_samples_per_second: 67.5610
   eval_steps_per_second: 4.3430
   epoch: 3.0000


  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])


In [None]:
# ================================
# Enhanced AfroXLMR Sentiment Classifier Implementation
# ================================

import numpy as np
import torch
import torch.nn.functional as F
from transformers import AutoTokenizer, AutoModelForSequenceClassification, AutoConfig
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.metrics import confusion_matrix, classification_report
import pandas as pd

class AfroXLMRSentimentClassifier:
    def __init__(self, model_name="Davlan/afro-xlmr-base", num_labels=3, device=None):
        """
        Initialize AfroXLMR sentiment classifier with XAI support.
        
        Args:
            model_name (str): HuggingFace model name/path
            num_labels (int): Number of sentiment classes
            device (str): 'cuda' or 'cpu', will auto-detect if None
        """
        self.device = device or ('cuda' if torch.cuda.is_available() else 'cpu')
        print(f"Using device: {self.device}")
        
        # Initialize with attention output enabled for XAI
        config = AutoConfig.from_pretrained(model_name, 
                                          num_labels=num_labels,
                                          output_attentions=True,
                                          output_hidden_states=True)
        
        self.tokenizer = AutoTokenizer.from_pretrained(model_name)
        self.model = AutoModelForSequenceClassification.from_pretrained(
            model_name,
            config=config
        ).to(self.device)
        
        self.id2label = {0: 'negative', 1: 'neutral', 2: 'positive'}
        self.label2id = {v: k for k, v in self.id2label.items()}

In [None]:
    def get_attention_weights(self, text):
        """
        Get attention weights for a given text input.
        
        Args:
            text (str): Input text for sentiment analysis
            
        Returns:
            dict: Contains tokens, attention weights, prediction and sentiment
        """
        # Tokenize and prepare input
        inputs = self.tokenizer(text, return_tensors="pt", padding=True, truncation=True)
        inputs = {k: v.to(self.device) for k, v in inputs.items()}
        
        # Get model outputs with attention
        with torch.no_grad():
            outputs = self.model(**inputs, output_attentions=True)
        
        # Get prediction
        logits = outputs.logits
        pred = torch.argmax(logits, dim=1).item()
        sentiment = self.id2label[pred]
        
        # Convert attention to CPU numpy
        attention = [layer.cpu().numpy() for layer in outputs.attentions]
        
        # Get tokens from input IDs
        tokens = self.tokenizer.convert_ids_to_tokens(inputs['input_ids'][0])
        
        return {
            'tokens': tokens,
            'attention': attention,
            'prediction': pred,
            'sentiment': sentiment
        }
    
    def visualize_attention(self, text, layer=-1, head=0, save_path=None):
        """
        Visualize attention weights for a given text input.
        
        Args:
            text (str): Input text for sentiment analysis
            layer (int): Transformer layer to visualize (-1 for last layer)
            head (int): Attention head to visualize
            save_path (str): Optional path to save the visualization
            
        Returns:
            numpy.ndarray: Attention weights matrix
        """
        # Get attention weights
        attention_data = self.get_attention_weights(text)
        tokens = attention_data['tokens']
        
        # Get attention matrix for specified layer and head
        attention_matrix = attention_data['attention'][layer][0, head]
        
        # Create figure
        plt.figure(figsize=(12, 10))
        
        # Create heatmap
        sns.heatmap(attention_matrix, 
                   xticklabels=tokens,
                   yticklabels=tokens,
                   cmap='YlOrRd',
                   cbar_kws={'label': 'Attention Weight'})
        
        # Rotate labels for better readability
        plt.xticks(rotation=45, ha='right')
        plt.yticks(rotation=0)
        
        # Add title with prediction
        plt.title(f'Attention Weights (Layer {layer+1}, Head {head+1})\n'
                 f'Predicted Sentiment: {attention_data["sentiment"]}')
        
        # Adjust layout
        plt.tight_layout()
        
        # Save if path provided
        if save_path:
            plt.savefig(save_path, dpi=300, bbox_inches='tight')
            print(f"Visualization saved to: {save_path}")
            
        plt.show()
        return attention_matrix
    
    def analyze_important_words(self, text, top_k=5):
        """
        Analyze and return the most important words based on attention weights.
        
        Args:
            text (str): Input text for sentiment analysis
            top_k (int): Number of top words to return
            
        Returns:
            dict: Contains input text, sentiment, and top important words
        """
        # Get attention weights
        attention_data = self.get_attention_weights(text)
        tokens = attention_data['tokens']
        
        # Get attention from last layer
        last_layer_attention = attention_data['attention'][-1][0]  # shape: (num_heads, seq_len, seq_len)
        
        # Average across all heads
        mean_attention = last_layer_attention.mean(axis=0)  # shape: (seq_len, seq_len)
        
        # Get CLS token attention (first token's attention to all other tokens)
        token_importance = mean_attention[0]  # shape: (seq_len,)
        
        # Create token-importance pairs
        token_scores = []
        for idx, (token, score) in enumerate(zip(tokens, token_importance)):
            # Skip special tokens
            if token in ['<s>', '</s>', '<pad>', '<unk>']:
                continue
            token_scores.append({
                'token': token,
                'importance': float(score),
                'position': idx
            })
        
        # Sort by importance score
        token_scores.sort(key=lambda x: x['importance'], reverse=True)
        
        return {
            'text': text,
            'sentiment': attention_data['sentiment'],
            'top_words': token_scores[:top_k]
        }

In [None]:
    def predict_with_probabilities(self, texts):
        """
        Get probability distributions for predictions to use in ensemble methods.
        
        Args:
            texts (str or list): Input text(s) for sentiment analysis
            
        Returns:
            numpy.ndarray: Probability distributions, shape (num_texts, num_classes)
        """
        # Handle single text input
        if isinstance(texts, str):
            texts = [texts]
            
        # Tokenize inputs
        inputs = self.tokenizer(texts, return_tensors="pt", padding=True, truncation=True)
        inputs = {k: v.to(self.device) for k, v in inputs.items()}
        
        # Get model outputs
        with torch.no_grad():
            outputs = self.model(**inputs)
            
        # Apply softmax to get probabilities
        probs = F.softmax(outputs.logits, dim=-1)
        
        # Convert to numpy array
        return probs.cpu().numpy()
    
    def get_logits(self, texts):
        """
        Get raw logits for ensemble methods that combine pre-softmax values.
        
        Args:
            texts (str or list): Input text(s) for sentiment analysis
            
        Returns:
            numpy.ndarray: Raw logits, shape (num_texts, num_classes)
        """
        # Handle single text input
        if isinstance(texts, str):
            texts = [texts]
            
        # Tokenize inputs
        inputs = self.tokenizer(texts, return_tensors="pt", padding=True, truncation=True)
        inputs = {k: v.to(self.device) for k, v in inputs.items()}
        
        # Get model outputs
        with torch.no_grad():
            outputs = self.model(**inputs)
            
        # Convert to numpy array
        return outputs.logits.cpu().numpy()

In [None]:
    def evaluate_detailed(self, test_texts, test_labels, save_confusion_matrix=True):
        """
        Perform detailed evaluation including confusion matrix and classification report.
        
        Args:
            test_texts (list): List of texts to evaluate
            test_labels (list): True labels (can be strings or integers)
            save_confusion_matrix (bool): Whether to save confusion matrix plot
            
        Returns:
            dict: Dictionary containing evaluation metrics and predictions
        """
        # Convert string labels to integers if needed
        if isinstance(test_labels[0], str):
            test_labels = [self.label2id[label] for label in test_labels]
        
        # Get predictions
        probs = self.predict_with_probabilities(test_texts)
        predictions = np.argmax(probs, axis=1)
        
        # Calculate metrics
        conf_matrix = confusion_matrix(test_labels, predictions)
        report = classification_report(test_labels, predictions, 
                                    target_names=list(self.id2label.values()),
                                    output_dict=True)
        
        # Create confusion matrix visualization
        if save_confusion_matrix:
            plt.figure(figsize=(8, 6))
            sns.heatmap(conf_matrix, annot=True, fmt='d', cmap='Blues',
                       xticklabels=list(self.id2label.values()),
                       yticklabels=list(self.id2label.values()))
            plt.title('Confusion Matrix')
            plt.ylabel('True Label')
            plt.xlabel('Predicted Label')
            plt.tight_layout()
            plt.savefig('afroxlmr_confusion_matrix.png', dpi=300, bbox_inches='tight')
            plt.close()
            print("Confusion matrix saved as 'afroxlmr_confusion_matrix.png'")
        
        # Print classification report
        print("\nClassification Report:")
        print(classification_report(test_labels, predictions, 
                                 target_names=list(self.id2label.values())))
        
        return {
            'accuracy': report['accuracy'],
            'precision': report['weighted avg']['precision'],
            'recall': report['weighted avg']['recall'],
            'f1': report['weighted avg']['f1-score'],
            'confusion_matrix': conf_matrix,
            'predictions': predictions,
            'detailed_report': report
        }
    
    def evaluate_per_language(self, test_texts, test_labels, languages):
        """
        Evaluate model performance separately for each language.
        
        Args:
            test_texts (list): List of texts to evaluate
            test_labels (list): True labels
            languages (list): List of language identifiers for each text
            
        Returns:
            dict: Dictionary with metrics for each language
        """
        results = {}
        unique_languages = set(languages)
        
        print("\nPer-Language Evaluation:")
        print("="*60)
        print(f"{'Language':<15} {'Samples':<8} {'Accuracy':<10} {'F1':<10}")
        print("-"*60)
        
        for lang in unique_languages:
            # Get indices for this language
            lang_indices = [i for i, l in enumerate(languages) if l == lang]
            if not lang_indices:
                continue
                
            # Filter data for this language
            lang_texts = [test_texts[i] for i in lang_indices]
            lang_labels = [test_labels[i] for i in lang_indices]
            
            # Get predictions
            probs = self.predict_with_probabilities(lang_texts)
            predictions = np.argmax(probs, axis=1)
            
            # Calculate metrics
            report = classification_report(lang_labels, predictions, 
                                        target_names=list(self.id2label.values()),
                                        output_dict=True)
            
            # Store results
            results[lang] = {
                'num_samples': len(lang_texts),
                'accuracy': report['accuracy'],
                'precision': report['weighted avg']['precision'],
                'recall': report['weighted avg']['recall'],
                'f1': report['weighted avg']['f1-score']
            }
            
            # Print results row
            print(f"{lang:<15} {len(lang_texts):<8} "
                  f"{results[lang]['accuracy']:.3f}  "
                  f"{results[lang]['f1']:.3f}")
        
        print("="*60)
        return results

In [None]:
    @staticmethod
    def load_from_lexicon(lexicon_path, target_languages=None):
        """
        Load data from multilingual sentiment lexicon.
        
        Args:
            lexicon_path (str): Path to lexicon CSV/Excel file
            target_languages (list): List of language columns to extract
            
        Returns:
            tuple: (texts_list, labels_list, languages_list)
        """
        # Default languages if none specified
        if target_languages is None:
            target_languages = ['zulu', 'xhosa', 'sepedi', 'shona', 'afrikaans', 'english']
            
        # Read lexicon file
        if lexicon_path.endswith('.csv'):
            df = pd.read_csv(lexicon_path)
        else:
            df = pd.read_excel(lexicon_path)
            
        texts, labels, languages = [], [], []
        
        # Map sentiment labels to integers
        sentiment_map = {'negative': 0, 'neutral': 1, 'positive': 2}
        
        # Process each language
        for lang in target_languages:
            if lang not in df.columns:
                print(f"Warning: Language '{lang}' not found in lexicon")
                continue
                
            # Get non-empty entries
            lang_data = df[[lang, 'sentiment']].dropna()
            lang_data = lang_data[lang_data[lang].str.strip() != '']
            
            if len(lang_data) == 0:
                print(f"Warning: No valid entries found for '{lang}'")
                continue
                
            # Add to lists
            texts.extend(lang_data[lang].tolist())
            labels.extend([sentiment_map.get(s.lower(), 1) for s in lang_data['sentiment']])
            languages.extend([lang] * len(lang_data))
            
        print(f"\nLoaded from lexicon:")
        print(f"Total entries: {len(texts)}")
        print("\nLanguage distribution:")
        for lang in set(languages):
            count = languages.count(lang)
            print(f"{lang}: {count} entries")
            
        return texts, labels, languages
    
    @staticmethod
    def load_from_corpus(corpus_path, text_column='text', label_column='sentiment', language_column=None):
        """
        Load data from a sentiment corpus file.
        
        Args:
            corpus_path (str): Path to corpus CSV/Excel file
            text_column (str): Name of column containing text
            label_column (str): Name of column containing sentiment labels
            language_column (str): Optional column name for language labels
            
        Returns:
            tuple: (texts, labels, languages)
        """
        # Read corpus file
        if corpus_path.endswith('.csv'):
            df = pd.read_csv(corpus_path)
        else:
            df = pd.read_excel(corpus_path)
            
        # Verify required columns exist
        if text_column not in df.columns or label_column not in df.columns:
            raise ValueError(f"Required columns not found. Available columns: {df.columns.tolist()}")
            
        # Map sentiment labels to integers
        sentiment_map = {'negative': 0, 'neutral': 1, 'positive': 2}
        
        # Extract data
        texts = df[text_column].tolist()
        labels = [sentiment_map.get(str(s).lower(), 1) for s in df[label_column]]
        
        # Get languages if column specified
        languages = None
        if language_column and language_column in df.columns:
            languages = df[language_column].tolist()
            
        print(f"\nLoaded from corpus:")
        print(f"Total entries: {len(texts)}")
        if languages:
            print("\nLanguage distribution:")
            lang_counts = pd.Series(languages).value_counts()
            for lang, count in lang_counts.items():
                print(f"{lang}: {count} entries")
                
        return texts, labels, languages
    
    @staticmethod
    def combine_datasets(lexicon_data, corpus_data):
        """
        Combine data from lexicon and corpus sources.
        
        Args:
            lexicon_data (tuple): (texts, labels, languages) from lexicon
            corpus_data (tuple): (texts, labels, languages) from corpus
            
        Returns:
            tuple: (combined_texts, combined_labels, combined_languages)
        """
        # Unpack data
        lex_texts, lex_labels, lex_langs = lexicon_data
        corp_texts, corp_labels, corp_langs = corpus_data
        
        # Combine texts and labels
        texts = lex_texts + corp_texts
        labels = lex_labels + corp_labels
        
        # Handle languages (might be None from corpus)
        if corp_langs is None:
            corp_langs = ['unknown'] * len(corp_texts)
        languages = lex_langs + corp_langs
        
        # Create DataFrame for easy manipulation
        df = pd.DataFrame({
            'text': texts,
            'label': labels,
            'language': languages
        })
        
        # Remove duplicates
        df = df.drop_duplicates(subset=['text'])
        
        # Print statistics
        print("\nCombined Dataset Statistics:")
        print(f"Total samples: {len(df)}")
        print("\nLabel distribution:")
        print(df['label'].value_counts().sort_index())
        print("\nLanguage distribution:")
        print(df['language'].value_counts())
        
        return df['text'].tolist(), df['label'].tolist(), df['language'].tolist()

In [None]:
    def train(self, train_texts, train_labels, validation_texts=None, validation_labels=None,
            languages=None, batch_size=16, num_epochs=3, learning_rate=2e-5):
        """
        Train the model with enhanced logging and language-aware evaluation.
        
        Args:
            train_texts (list): Training texts
            train_labels (list): Training labels
            validation_texts (list): Optional validation texts
            validation_labels (list): Optional validation labels
            languages (list): Optional list of language identifiers for texts
            batch_size (int): Batch size for training
            num_epochs (int): Number of training epochs
            learning_rate (float): Learning rate for optimization
        """
        print("\n" + "="*50)
        print("AfroXLMR Training Configuration")
        print("="*50)
        print(f"Model: {self.model.config._name_or_path}")
        print(f"Training samples: {len(train_texts)}")
        print(f"Validation samples: {len(validation_texts) if validation_texts else 'None'}")
        print(f"Batch size: {batch_size}")
        print(f"Epochs: {num_epochs}")
        print(f"Learning rate: {learning_rate}")
        print(f"Device: {self.device}")
        
        if languages:
            print("\nLanguage Distribution:")
            lang_counts = pd.Series(languages).value_counts()
            for lang, count in lang_counts.items():
                print(f"{lang}: {count} samples")
                
        print("\nLabel Distribution:")
        label_counts = pd.Series(train_labels).value_counts().sort_index()
        for label, count in label_counts.items():
            print(f"{self.id2label[label]}: {count} samples")
            
        print("="*50)
        
        # Create PyTorch datasets
        train_dataset = SentimentDataset(train_texts, train_labels, self.tokenizer)
        if validation_texts:
            val_dataset = SentimentDataset(validation_texts, validation_labels, self.tokenizer)
        
        # Training arguments
        training_args = TrainingArguments(
            output_dir='./results',
            num_train_epochs=num_epochs,
            per_device_train_batch_size=batch_size,
            per_device_eval_batch_size=batch_size,
            warmup_steps=100,
            weight_decay=0.01,
            logging_dir='./logs',
            logging_steps=10,
            eval_steps=100,
            save_steps=100,
            load_best_model_at_end=True
        )
        
        # Create trainer
        trainer = Trainer(
            model=self.model,
            args=training_args,
            train_dataset=train_dataset,
            eval_dataset=val_dataset if validation_texts else None,
            compute_metrics=lambda p: {
                'accuracy': accuracy_score(p.label_ids, p.predictions.argmax(-1))
            }
        )
        
        # Train
        trainer.train()
        
        # Final evaluation
        if validation_texts:
            print("\nFinal Validation Results:")
            self.evaluate_detailed(validation_texts, validation_labels)
            
            if languages:
                print("\nPer-Language Validation Results:")
                self.evaluate_per_language(validation_texts, validation_labels, 
                                        languages[-len(validation_texts):])
                
        print("\n" + "="*50)
        print("Training completed!")
        print("="*50)

In [None]:
# ================================
# Comprehensive Example Usage
# ================================

# Initialize classifier
classifier = AfroXLMRSentimentClassifier()

# 1. Load data from lexicon and corpus
print("Loading data...")
lexicon_data = AfroXLMRSentimentClassifier.load_from_lexicon(
    'corrected_lexicon.xlsx',
    target_languages=['zulu', 'xhosa', 'sepedi', 'afrikaans', 'english']
)

corpus_data = AfroXLMRSentimentClassifier.load_from_corpus(
    'french_test_corpus.xlsx',
    text_column='text',
    label_column='sentiment',
    language_column='language'
)

# 2. Combine datasets
texts, labels, languages = AfroXLMRSentimentClassifier.combine_datasets(lexicon_data, corpus_data)

# 3. Split into train/validation
X_train, X_val, y_train, y_val, langs_train, langs_val = train_test_split(
    texts, labels, languages, test_size=0.2, random_state=42, stratify=labels
)

# 4. Train the model
classifier.train(
    train_texts=X_train,
    train_labels=y_train,
    validation_texts=X_val,
    validation_labels=y_val,
    languages=langs_train,
    batch_size=16,
    num_epochs=3
)

# 5. XAI Example - Visualize attention
example_text = "I am very happy with the results"
classifier.visualize_attention(
    example_text,
    layer=-1,  # last layer
    head=0,    # first attention head
    save_path='attention_viz.png'
)

# 6. Analyze important words
important_words = classifier.analyze_important_words(example_text, top_k=3)
print("\nImportant words analysis:")
print(f"Text: {important_words['text']}")
print(f"Sentiment: {important_words['sentiment']}")
print("Top words:")
for word in important_words['top_words']:
    print(f"- {word['token']}: {word['importance']:.3f}")

# 7. Ensemble prediction example
texts_to_predict = [
    "This is excellent work",
    "I'm not sure about this",
    "This is terrible"
]

# Get probabilities for ensemble methods
probs = classifier.predict_with_probabilities(texts_to_predict)
print("\nPrediction probabilities:")
for text, prob in zip(texts_to_predict, probs):
    print(f"\nText: {text}")
    for i, p in enumerate(prob):
        print(f"{classifier.id2label[i]}: {p:.3f}")

# 8. Detailed evaluation
test_texts = [
    "The results are amazing",
    "I don't like this approach",
    "This seems okay to me"
]
test_labels = [2, 0, 1]  # positive, negative, neutral

print("\nDetailed Evaluation:")
eval_results = classifier.evaluate_detailed(
    test_texts,
    test_labels,
    save_confusion_matrix=True
)

# 9. Per-language evaluation
test_languages = ['english', 'zulu', 'english']
print("\nPer-Language Evaluation:")
lang_results = classifier.evaluate_per_language(
    test_texts,
    test_labels,
    test_languages
)

# Initializing the AfroXLMR Sentiment Classifier

The classifier can be initialized with the following parameters:

1. `model_name`: The HuggingFace model name (default: "Davlan/afro-xlmr-base")
2. `num_labels`: Number of sentiment classes (default: 3 for negative/neutral/positive)
3. `device`: Computing device (default: will auto-detect CUDA/CPU)

The initialization will:
- Load the model and tokenizer
- Enable attention outputs for XAI features
- Set up sentiment label mappings
- Move the model to the appropriate device (GPU if available)

In [None]:
# Initialize the classifier
classifier = AfroXLMRSentimentClassifier()

# You can also specify parameters explicitly:
# classifier = AfroXLMRSentimentClassifier(
#     model_name="Davlan/afro-xlmr-base",
#     num_labels=3,
#     device="cuda"  # or "cpu"
# )

# Test the classifier with a simple example
text = "I am very happy with the results!"
attention_data = classifier.get_attention_weights(text)
print(f"Input text: {text}")
print(f"Predicted sentiment: {attention_data['sentiment']}")

# Visualize attention for the example
classifier.visualize_attention(text, save_path='example_attention.png')

# Analyze important words
important_words = classifier.analyze_important_words(text, top_k=3)
print("\nMost important words:")
for word in important_words['top_words']:
    print(f"- {word['token']}: importance = {word['importance']:.3f}")

In [None]:
#BELOW I WILL VISUALISE AfroXLMR, the issue is the data is very biased mostly positive so the model will definetely reflect that:
#Positive (label 2): 8,599 examples (83.4%)
#Negative (label 0): 980 examples (9.5%)
#Neutral (label 1): 875 examples (8.5%)

#Maybe we use class weights, or use data augmentation to balance the classes
