In [None]:
# =========================================================
# 1. IMPORTS AND SETUP
# =========================================================
import os
import torch
import pandas as pd
import numpy as np
from datasets import Dataset
from transformers import (
    AutoTokenizer,
    AutoModelForSequenceClassification,
    Trainer,
    TrainingArguments,
    DataCollatorWithPadding,
)
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score, f1_score
from functools import partial
import random

# =========================================================
# 2. CONFIGURATION
# =========================================================
# Spreadsheet holding the lexicon; read with pandas/openpyxl in load_african_data().
DATA_PATH     = "expanded_lexicon.xlsx"
# Columns whose words are used for training/validation and sentence generation.
LANGUAGE_COLS = ["Zulu", "Sepedi", "Xhosa"]   # African languages for training/eval
ENGLISH_COL   = "English"                     # for translation display
# Column holding the sentiment class that is label-encoded into "label".
LABEL_COL     = "Sentiment"
# Checkpoint name passed to AutoTokenizer / AutoModelForSequenceClassification.
MODEL_NAME    = "castorini/afriberta_base"
# Maximum token length handed to the tokenizer (truncation limit).
MAX_LEN       = 128
# Per-device training batch size; evaluation uses twice this value.
BATCH_SIZE    = 8
NUM_EPOCHS    = 5
LEARNING_RATE = 2e-5
# Shared seed: used for `random`, the train/validation split and TrainingArguments.
SEED          = 42

# Set before any tokenizer is created.
# NOTE(review): presumably silences the tokenizers fork/parallelism warning
# when worker processes are spawned - confirm.
os.environ["TOKENIZERS_PARALLELISM"] = "false"
# Seeds only Python's `random` module, which drives the sentence sampling below.
random.seed(SEED)

# =========================================================
# 3. DATA PREPARATION (train only on African languages)
# =========================================================
def load_african_data():
    """Load the lexicon spreadsheet and build train/validation datasets.

    The spreadsheet at ``DATA_PATH`` is read, the ``LABEL_COL`` values are
    label-encoded into an integer ``"label"`` column, and every column listed
    in ``LANGUAGE_COLS`` is stacked into one long frame with the columns
    ``text``, ``LABEL_COL``, ``label`` and ``language``. That frame is split
    80/20 (stratified on the label, seeded with ``SEED``).

    Returns:
        tuple: ``(train_ds, val_ds, label_encoder, df)`` where the first two
        are ``datasets.Dataset`` objects, ``label_encoder`` is the fitted
        ``LabelEncoder`` and ``df`` is the labelled wide-format DataFrame
        (one row per lexicon entry, including the new ``"label"`` column).

    Raises:
        ValueError: If a required column is missing or no row carries a label.
    """
    df = pd.read_excel(DATA_PATH, engine="openpyxl")
    for col in LANGUAGE_COLS + [ENGLISH_COL, LABEL_COL]:
        if col not in df.columns:
            raise ValueError(f"Missing required column: {col}")

    # Rows without a sentiment cannot supervise training. Leaving them in
    # would either break the encoder or be encoded as a spurious extra class,
    # which in turn inflates the size of the model's classification head.
    df = df.dropna(subset=[LABEL_COL]).reset_index(drop=True)
    if df.empty:
        raise ValueError(f"No labelled rows found in column: {LABEL_COL}")

    label_encoder = LabelEncoder()
    df["label"] = label_encoder.fit_transform(df[LABEL_COL])

    # Combine all African language data into one long DataFrame.
    frames = []
    for lang in LANGUAGE_COLS:
        temp = df[[lang, LABEL_COL, "label"]].dropna().copy()
        temp = temp.rename(columns={lang: "text"})
        # Spreadsheet cells may hold numbers or padded strings; the tokenizer
        # needs plain, non-empty text.
        temp["text"] = temp["text"].astype(str).str.strip()
        temp = temp[temp["text"] != ""]
        temp["language"] = lang
        frames.append(temp)

    full_lang_df = pd.concat(frames, ignore_index=True)

    # Stratified split keeps the class balance identical in both subsets.
    train_df, val_df = train_test_split(
        full_lang_df,
        test_size=0.2,
        random_state=SEED,
        stratify=full_lang_df["label"],
    )

    # preserve_index=False stops the shuffled pandas index from being stored
    # as an extra "__index_level_0__" column in the datasets.
    train_ds = Dataset.from_pandas(train_df, preserve_index=False)
    val_ds = Dataset.from_pandas(val_df, preserve_index=False)
    return train_ds, val_ds, label_encoder, df

# =========================================================
# 4. TOKENIZATION
# =========================================================
def tokenize_function(batch, tokenizer, max_len):
    """Tokenize the ``"text"`` entries of a batch.

    Args:
        batch: Mapping with a ``"text"`` entry holding the texts to encode.
        tokenizer: Callable tokenizer; receives the texts plus the padding,
            truncation and length options.
        max_len: Length every sequence is padded/truncated to.

    Returns:
        Whatever ``tokenizer`` returns for the batch.
    """
    encode_options = {
        "padding": "max_length",
        "truncation": True,
        "max_length": max_len,
    }
    texts = batch["text"]
    return tokenizer(texts, **encode_options)

# =========================================================
# 5. METRICS
# =========================================================
def compute_metrics(eval_pred):
    """Compute accuracy and weighted F1 from an evaluation prediction pair.

    Args:
        eval_pred: Two-item iterable ``(logits, labels)``.

    Returns:
        dict: ``{"accuracy": ..., "f1": ...}`` where the predicted class is
        the arg-max over the last axis of the logits.
    """
    scores, gold = eval_pred
    predicted = np.argmax(scores, axis=-1)
    accuracy = accuracy_score(gold, predicted)
    weighted_f1 = f1_score(gold, predicted, average="weighted")
    return {"accuracy": accuracy, "f1": weighted_f1}

# =========================================================
# 6. SENTENCE GENERATION AND EVALUATION
# =========================================================
def generate_sentences_and_evaluate(trainer, df, label_encoder, tokenizer):
    """Build random pseudo-sentences from lexicon words and classify them.

    For every language in ``LANGUAGE_COLS`` that exists in ``df``, 25
    "sentences" are created by sampling up to five lexicon entries and joining
    the words with spaces. Each sentence is classified with ``trainer.model``
    and stored next to the space-joined English words of the same entries.
    All rows are written to ``african_sentence_evaluation_results.csv`` and
    the first ten are printed.

    Args:
        trainer: Object exposing the fine-tuned classifier as ``.model``.
        df: Wide lexicon DataFrame with the language columns and
            ``ENGLISH_COL``.
        label_encoder: Fitted encoder used to map predicted ids back to
            sentiment names.
        tokenizer: Tokenizer matching the model.

    Side effects:
        Writes the CSV file, prints progress, and leaves the model in
        evaluation mode.
    """
    print("\nüß© Generating and evaluating lexicon-based sentences...\n")

    model = trainer.model
    # After training the model may sit on the GPU while tokenizer output is
    # created on the CPU; inputs must be moved to the model's device.
    device = next(model.parameters()).device
    # Disable dropout so predictions are deterministic for a given input.
    model.eval()

    num_sentences = 25
    words_per_sentence = 5
    results = []

    for lang in LANGUAGE_COLS:
        if lang not in df.columns:
            print(f"‚ö†Ô∏è Skipping {lang} - column not found")
            continue

        # Drop incomplete rows jointly so that position i refers to the same
        # lexicon entry in both lists; dropping per column would shift one
        # list against the other and pair words with the wrong translation.
        pairs = df[[lang, ENGLISH_COL]].dropna()
        lang_words = pairs[lang].astype(str).tolist()
        eng_words = pairs[ENGLISH_COL].astype(str).tolist()

        n = len(lang_words)
        if n == 0:
            print(f"Skipping {lang} - no words with an English translation")
            continue

        for _ in range(num_sentences):
            idxs = random.sample(range(n), k=min(words_per_sentence, n))
            afr_sentence = " ".join(lang_words[i] for i in idxs)
            eng_sentence = " ".join(eng_words[i] for i in idxs)

            # Tokenize and predict
            inputs = tokenizer(
                afr_sentence,
                return_tensors="pt",
                truncation=True,
                padding=True,
                max_length=MAX_LEN,
            )
            inputs = {name: tensor.to(device) for name, tensor in inputs.items()}
            with torch.no_grad():
                logits = model(**inputs).logits
            # arg-max of the logits equals arg-max of their softmax.
            pred_label = torch.argmax(logits, dim=1).item()
            pred_sentiment = label_encoder.inverse_transform([pred_label])[0]

            results.append({
                "Language": lang,
                "African_Sentence": afr_sentence,
                "English_Translation": eng_sentence,
                "Predicted_Sentiment": pred_sentiment,
            })

    # Save to CSV; explicit columns keep the header even when nothing was generated.
    out_df = pd.DataFrame(
        results,
        columns=[
            "Language",
            "African_Sentence",
            "English_Translation",
            "Predicted_Sentiment",
        ],
    )
    out_path = "african_sentence_evaluation_results.csv"
    out_df.to_csv(out_path, index=False, encoding="utf-8")
    print(f"‚úÖ Results saved to: {out_path}\n")

    # Show sample outputs
    print(out_df.head(10).to_string(index=False))

# =========================================================
# 7. MAIN FUNCTION
# =========================================================
def main():
    """Run the full pipeline: load data, fine-tune, evaluate, save, demo.

    Steps:
        1. Load and split the lexicon data.
        2. Tokenize both splits with the ``MODEL_NAME`` tokenizer.
        3. Fine-tune a sequence-classification head with ``Trainer``.
        4. Evaluate the best checkpoint and print the metrics.
        5. Save the model to ``./final_afriberta_african_model``.
        6. Generate and classify synthetic lexicon sentences.
    """
    print("üöÄ Loading African language data...")
    train_ds, val_ds, label_encoder, full_df = load_african_data()

    print("üî§ Loading tokenizer and model...")
    tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME, use_fast=False)

    # Cap worker processes at four; os.cpu_count() may return None.
    num_proc = min(4, os.cpu_count() or 1)
    tokenize_partial = partial(tokenize_function, tokenizer=tokenizer, max_len=MAX_LEN)

    print("üìù Tokenizing datasets...")
    train_ds = train_ds.map(tokenize_partial, batched=True, num_proc=num_proc)
    val_ds   = val_ds.map(tokenize_partial, batched=True, num_proc=num_proc)

    # Expose only the tensors the model consumes.
    train_ds.set_format(type="torch", columns=["input_ids", "attention_mask", "label"])
    val_ds.set_format(type="torch", columns=["input_ids", "attention_mask", "label"])

    print("üß† Initializing model...")
    # Store the sentiment names in the model config so the saved checkpoint is
    # self-describing; otherwise the mapping lives only in the in-memory
    # LabelEncoder and is lost once this process ends.
    class_names = [str(name) for name in label_encoder.classes_]
    id2label = dict(enumerate(class_names))
    label2id = {name: idx for idx, name in id2label.items()}
    model = AutoModelForSequenceClassification.from_pretrained(
        MODEL_NAME,
        num_labels=len(class_names),
        id2label=id2label,
        label2id=label2id,
    )

    training_args = TrainingArguments(
        output_dir="./results_african",
        num_train_epochs=NUM_EPOCHS,
        per_device_train_batch_size=BATCH_SIZE,
        per_device_eval_batch_size=BATCH_SIZE * 2,
        learning_rate=LEARNING_RATE,
        weight_decay=0.01,
        # Evaluate and checkpoint once per epoch, then restore the checkpoint
        # with the lowest validation loss.
        eval_strategy="epoch",
        save_strategy="epoch",
        load_best_model_at_end=True,
        metric_for_best_model="eval_loss",
        greater_is_better=False,
        logging_steps=100,
        warmup_ratio=0.06,
        # Mixed precision only when a CUDA device is present.
        fp16=torch.cuda.is_available(),
        save_total_limit=2,
        dataloader_num_workers=num_proc,
        report_to="none",
        seed=SEED,
    )

    data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

    # NOTE(review): newer transformers releases appear to deprecate the
    # `tokenizer=` argument in favour of `processing_class=` - confirm against
    # the installed version before upgrading.
    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=train_ds,
        eval_dataset=val_ds,
        tokenizer=tokenizer,
        data_collator=data_collator,
        compute_metrics=compute_metrics,
    )

    print("üèãÔ∏è Training AfriBERTa on African languages...")
    trainer.train()

    print("‚úÖ Evaluating best model on validation set...")
    eval_results = trainer.evaluate()
    print("\n=== African Languages Validation Results ===")
    print(eval_results)

    trainer.save_model("./final_afriberta_african_model")

    # ---------------------------------------------------------
    # Generate and evaluate synthetic sentences
    # ---------------------------------------------------------
    generate_sentences_and_evaluate(trainer, full_df, label_encoder, tokenizer)

# =========================================================
# 8. ENTRY POINT
# =========================================================
# Run the whole pipeline only when this file is executed directly,
# not when it is imported as a module.
if __name__ == "__main__":
    main()