# Swahili Twitter Sentiment Analysis: GRU vs AfriBERTa

**Objective:** Compare a simple GRU model with a fine-tuned AfriBERTa transformer for Swahili tweet sentiment classification.

**Contents:**
1. Setup and Data Loading
2. Exploratory Data Analysis (EDA)
3. Preprocessing Pipeline
4. GRU Model Implementation
5. AfriBERTa Fine-Tuning
6. Model Evaluation and Comparison

## 1. Setup and Data Loading 📥

In [None]:
# Install required packages
import subprocess
import sys

packages = [
    'datasets', 'transformers', 'tensorflow', 'scikit-learn',
    'matplotlib', 'seaborn', 'pandas', 'numpy', 'tqdm'
]

# Install each package in its own pip call so one failure does not block the
# rest, but record failures instead of silently discarding them.
failed_packages = []
for package in packages:
    try:
        subprocess.check_call([sys.executable, "-m", "pip", "install", "-q", package])
    except (subprocess.CalledProcessError, OSError) as exc:
        failed_packages.append(package)
        print(f"✗ Could not install {package}: {exc}")

# Only claim success when every install actually succeeded.
if failed_packages:
    print(f"⚠ Installation failed for: {', '.join(failed_packages)}")
else:
    print("✓ All packages installed")

In [None]:
# Import libraries
import warnings
warnings.filterwarnings('ignore')

import os
import re
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from collections import Counter

import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.layers import Embedding, GRU, Dense, Dropout
from tensorflow.keras.models import Sequential
from tensorflow.keras.callbacks import EarlyStopping

from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_score, f1_score, confusion_matrix, classification_report
from sklearn.preprocessing import LabelEncoder

from datasets import load_dataset
from transformers import AutoTokenizer, TFAutoModelForSequenceClassification, pipeline
from tqdm import tqdm

# Set seeds for reproducibility
# The same fixed value seeds NumPy's global RNG and TensorFlow's global seed.
np.random.seed(42)
tf.random.set_seed(42)

print("✓ Libraries imported successfully")

In [None]:
# Load Swahili Twitter Sentiment dataset
print("Loading dataset...")

# Only the fallback path creates a validation split; define the name up front
# so it exists whichever path runs.
val_data = None

try:
    # Try loading the Swahili tweets dataset
    dataset = load_dataset("swahili_tweets", "sentiment")
    train_data = pd.DataFrame(dataset['train'])
    test_data = pd.DataFrame(dataset['test']) if 'test' in dataset else None
except Exception as exc:
    # `except Exception` (not a bare except) lets Ctrl-C / SystemExit propagate,
    # and the reason for the fallback is reported instead of being hidden.
    print(f"Could not load 'swahili_tweets' ({type(exc).__name__}: {exc})")

    # Fallback: use tweet_eval dataset as demonstration
    # NOTE(review): tweet_eval is an English-language benchmark, so results
    # from this path do not reflect Swahili performance - confirm the intended
    # Swahili dataset id.
    print("Using tweet_eval dataset as fallback...")
    dataset = load_dataset("tweet_eval", "sentiment")
    train_full = pd.DataFrame(dataset['train'])
    test_data = pd.DataFrame(dataset['test']) if 'test' in dataset else None

    # Rename columns for consistency
    if 'text' not in train_full.columns:
        train_full = train_full.rename(columns={'Tweet': 'text'})
    if 'label' not in train_full.columns:
        train_full = train_full.rename(columns={'Label': 'label'})

    # Split into train/val (stratified so both parts keep the label balance)
    train_data, val_data = train_test_split(
        train_full, test_size=0.2, random_state=42, stratify=train_full['label']
    )

# Ensure consistent column names
if 'tweet' in train_data.columns:
    train_data = train_data.rename(columns={'tweet': 'text'})
if 'sentiment' in train_data.columns:
    train_data = train_data.rename(columns={'sentiment': 'label'})

print(f"\n✓ Dataset loaded: {len(train_data)} training samples")
print(f"✓ Features: {list(train_data.columns)}")
print(f"\nFirst 2 samples:")
print(train_data.head(2))

## 2. Exploratory Data Analysis (EDA) 📊

In [None]:
# EDA: summary statistics and plots for the training data
banner = "=" * 60
print(banner)
print("DATASET STATISTICS")
print(banner)


def _word_count(value):
    """Number of whitespace-delimited tokens in the stringified value."""
    return len(str(value).split())


# Text length statistics
train_data['text_length'] = train_data['text'].apply(_word_count)
lengths = train_data['text_length']
mean_len = lengths.mean()
median_len = lengths.median()

print(f"\nText Length Statistics:")
print(f"  Mean: {mean_len:.1f} words")
print(f"  Median: {median_len:.1f} words")
print(f"  Max: {lengths.max()} words")
print(f"  Min: {lengths.min()} words")

# Label distribution (sentiment_names is indexed by the integer label value)
print(f"\nLabel Distribution:")
label_counts = train_data['label'].value_counts().sort_index()
sentiment_names = ['Negative', 'Neutral', 'Positive']
n_samples = len(train_data)
for label_idx, count in label_counts.items():
    pct = 100 * count / n_samples
    print(f"  {sentiment_names[label_idx]:10} (Label {label_idx}): {int(count):5} ({pct:5.1f}%)")

print(f"\nTotal samples: {n_samples}")

# Visualize label distribution: bar chart on the left, pie chart on the right
fig, axes = plt.subplots(1, 2, figsize=(14, 4))
colors = ['#FF6B6B', '#FFA500', '#4ECB71']
bar_ax, pie_ax = axes[0], axes[1]

bar_ax.bar(sentiment_names, label_counts, color=colors, alpha=0.7, edgecolor='black')
bar_ax.set_title('Sentiment Label Distribution', fontsize=12, fontweight='bold')
bar_ax.set_ylabel('Number of Samples')
bar_ax.grid(axis='y', alpha=0.3)

pie_ax.pie(label_counts, labels=sentiment_names, autopct='%1.1f%%', colors=colors, startangle=90)
pie_ax.set_title('Sentiment Distribution (%)', fontsize=12, fontweight='bold')

plt.tight_layout()
plt.show()

# Text length distribution with mean/median markers
fig, ax = plt.subplots(figsize=(10, 5))
ax.hist(lengths, bins=50, color='#4ECDC4', alpha=0.7, edgecolor='black')
ax.axvline(mean_len, color='red', linestyle='--', linewidth=2, label=f'Mean: {mean_len:.1f}')
ax.axvline(median_len, color='green', linestyle='--', linewidth=2, label=f'Median: {median_len:.1f}')
ax.set_title('Distribution of Text Lengths', fontsize=12, fontweight='bold')
ax.set_xlabel('Number of Words')
ax.set_ylabel('Frequency')
ax.legend()
ax.grid(alpha=0.3)
plt.tight_layout()
plt.show()

print("\n✓ EDA complete")

## 3. Preprocessing Pipeline 🧼

In [None]:
# Text cleaning function
def clean_text(text):
    """Strip URLs, @mentions, '#' symbols and non-alphanumeric characters.

    Args:
        text: Raw tweet text as a ``str``.

    Returns:
        The cleaned text with leading/trailing whitespace removed. Interior
        whitespace is left untouched, so removed tokens can leave runs of
        spaces behind.
    """
    # The patterns are raw strings with SINGLE backslashes. A doubled backslash
    # (e.g. r'\\S') would match a literal "\" followed by "S": URLs and
    # mentions would survive, and the final character class would no longer
    # whitelist whitespace, deleting every space in the tweet.
    text = re.sub(r'http\S+|www\S+|https\S+', '', text, flags=re.MULTILINE)  # Remove URLs
    text = re.sub(r'@\w+', '', text)  # Remove @mentions
    text = re.sub(r'#', '', text)  # Remove only the '#' symbol; the tag word is kept
    text = re.sub(r'[^A-Za-z0-9\s]', '', text)  # Remove special characters
    text = text.strip()
    return text

# Swahili stopwords: words that remove_stopwords() filters out of each tweet
swahili_stopwords = {
    'na', 'ni', 'kwa', 'za', 'wa',
    'kwamba', 'ile', 'hii', 'hiyo', 'yule',
    'mwenyewe', 'wao', 'zao', 'kwenye',
    'karibu', 'pamoja', 'sana', 'tu',
    'zaidi', 'kidogo', 'tena', 'kila',
    'nyingine', 'ingine', 'pia', 'au',
    'lakini', 'kama', 'ikiwa', 'akiwa',
    'wakati', 'kabla', 'baada', 'tangu',
}

def remove_stopwords(text):
    """Return ``text`` without Swahili stopwords.

    The text is split on whitespace, each token is compared in lower case
    against ``swahili_stopwords``, and the surviving tokens (original casing)
    are re-joined with single spaces.
    """
    kept_words = (word for word in text.split() if word.lower() not in swahili_stopwords)
    return ' '.join(kept_words)

# Apply preprocessing: clean every tweet, then strip stopwords from the result
print("Preprocessing text...")
cleaned_only = train_data['text'].apply(lambda raw: clean_text(str(raw)))
train_data['text_cleaned'] = cleaned_only.apply(remove_stopwords)

print("✓ Text cleaned and stopwords removed")
print(f"\nExample cleaned texts:")
# Show the first two rows before and after cleaning (truncated to 80 chars)
for row in range(2):
    original_text = train_data['text'].iloc[row]
    cleaned_text = train_data['text_cleaned'].iloc[row]
    print(f"  Original: {original_text[:80]}...")
    print(f"  Cleaned:  {cleaned_text[:80]}...")
    print()

In [None]:
# Split data into train/validation/test for GRU model
# Use 60% train, 20% val, 20% test
all_positions = range(len(train_data))
train_idx, temp_idx = train_test_split(
    all_positions,
    test_size=0.4,
    random_state=42,
    stratify=train_data['label'],
)
# Halve the 40% hold-out into validation and test, again stratified by label
holdout_labels = train_data.iloc[temp_idx]['label']
val_idx, test_idx = train_test_split(
    temp_idx,
    test_size=0.5,
    random_state=42,
    stratify=holdout_labels,
)


def _texts_and_labels(positions):
    """Return (cleaned texts, labels) as arrays for the given row positions."""
    subset = train_data.iloc[positions]
    return subset['text_cleaned'].values, subset['label'].values


X_train, y_train = _texts_and_labels(train_idx)
X_val, y_val = _texts_and_labels(val_idx)
X_test, y_test = _texts_and_labels(test_idx)

print(f"Train: {len(X_train)}, Val: {len(X_val)}, Test: {len(X_test)}")

# Tokenization and padding for GRU
MAX_VOCAB = 5000
MAX_LEN = 100

# The tokenizer is fitted on the training texts only
tokenizer_gru = Tokenizer(num_words=MAX_VOCAB, oov_token='<unk>')
tokenizer_gru.fit_on_texts(X_train)

X_train_seq, X_val_seq, X_test_seq = (
    tokenizer_gru.texts_to_sequences(texts) for texts in (X_train, X_val, X_test)
)

X_train_pad, X_val_pad, X_test_pad = (
    pad_sequences(sequences, maxlen=MAX_LEN, padding='post')
    for sequences in (X_train_seq, X_val_seq, X_test_seq)
)

print(f"✓ Tokenization & padding complete")
# NOTE(review): this prints the size of the full word index, which may exceed
# the MAX_VOCAB cap passed as num_words - confirm which figure is intended.
print(f"  Vocab size: {len(tokenizer_gru.word_index) + 1}")
print(f"  Sequence shape: {X_train_pad.shape}")

## 4. GRU Model Implementation 🧠

In [None]:
# Build and train GRU model
print("Building GRU model...")

gru_model = Sequential([
    # An explicit Input replaces the deprecated `input_length` argument and
    # builds the model up front, so summary() works before fit().
    tf.keras.Input(shape=(MAX_LEN,), dtype='int32'),
    # mask_zero=True: the inputs are post-padded with 0, so without a mask the
    # GRU state would be pushed through all trailing pad steps of short tweets.
    Embedding(input_dim=MAX_VOCAB, output_dim=64, mask_zero=True),
    GRU(units=64, return_sequences=False),
    Dropout(0.3),
    Dense(32, activation='relu'),
    Dense(3, activation='softmax')  # one probability per sentiment class
])

# Integer class labels -> sparse categorical cross-entropy
gru_model.compile(
    optimizer='adam',
    loss='sparse_categorical_crossentropy',
    metrics=['accuracy']
)

# summary() prints the table itself and returns None, so it is not wrapped in print()
gru_model.summary()

# Train GRU model
print("\nTraining GRU model...")
# Stop once val_loss has not improved for 2 epochs and roll back to the best weights
early_stop = EarlyStopping(monitor='val_loss', patience=2, restore_best_weights=True)

gru_history = gru_model.fit(
    X_train_pad, y_train,
    validation_data=(X_val_pad, y_val),
    epochs=5,
    batch_size=32,
    callbacks=[early_stop],
    verbose=0
)

print("✓ GRU training complete")

# Plot GRU training history
fig, axes = plt.subplots(1, 2, figsize=(12, 4))

axes[0].plot(gru_history.history['accuracy'], label='Train Accuracy')
axes[0].plot(gru_history.history['val_accuracy'], label='Val Accuracy')
axes[0].set_title('GRU Model Accuracy', fontweight='bold')
axes[0].set_xlabel('Epoch')
axes[0].set_ylabel('Accuracy')
axes[0].legend()
axes[0].grid(alpha=0.3)

axes[1].plot(gru_history.history['loss'], label='Train Loss')
axes[1].plot(gru_history.history['val_loss'], label='Val Loss')
axes[1].set_title('GRU Model Loss', fontweight='bold')
axes[1].set_xlabel('Epoch')
axes[1].set_ylabel('Loss')
axes[1].legend()
axes[1].grid(alpha=0.3)

plt.tight_layout()
plt.show()

## 5. AfriBERTa Fine-Tuning ⚡

In [None]:
# Prepare data for AfriBERTa (using transformer tokenizer)
print("Preparing data for AfriBERTa...")

# NOTE(review): this checkpoint is XLM-RoBERTa base, not AfriBERTa, although
# the notebook labels its results "AfriBERTa". If an AfriBERTa comparison is
# intended, swap in an AfriBERTa checkpoint (e.g. "castorini/afriberta_small")
# after confirming TensorFlow weights and tokenizer dependencies are available.
MODEL_NAME = "xlm-roberta-base"
tokenizer_bert = AutoTokenizer.from_pretrained(MODEL_NAME)

# Tokenize texts for BERT
def tokenize_function(texts):
    """Tokenize a list of strings into fixed-length TensorFlow tensors.

    Args:
        texts: List of input strings.

    Returns:
        The tokenizer's batch encoding, with every sequence padded or
        truncated to 128 tokens and values returned as TensorFlow tensors.
    """
    return tokenizer_bert(
        texts,
        padding="max_length",
        truncation=True,
        max_length=128,
        return_tensors="tf"
    )

# Tokenize train, val, and test sets
print("Tokenizing datasets...")
train_encodings = tokenize_function(X_train.tolist())
val_encodings = tokenize_function(X_val.tolist())
test_encodings = tokenize_function(X_test.tolist())

# Create TensorFlow datasets
# Shuffle individual examples BEFORE batching, with a buffer covering the whole
# training set. Shuffling after batch() would only reorder fixed batches, so
# every epoch would see the same examples grouped together.
train_dataset = (
    tf.data.Dataset.from_tensor_slices((dict(train_encodings), y_train))
    .shuffle(buffer_size=len(y_train))
    .batch(16)
)

# Validation and test data keep their order so predictions line up with labels
val_dataset = tf.data.Dataset.from_tensor_slices((
    dict(val_encodings),
    y_val
)).batch(16)

test_dataset = tf.data.Dataset.from_tensor_slices((
    dict(test_encodings),
    y_test
)).batch(16)

print("✓ Data prepared for AfriBERTa")

In [None]:
# Load and fine-tune AfriBERTa model
print("Loading AfriBERTa model...")

# Sequence-classification head with three output labels on top of MODEL_NAME
bert_model = TFAutoModelForSequenceClassification.from_pretrained(MODEL_NAME, num_labels=3)

# Compile model: the loss is configured with from_logits=True
bert_optimizer = tf.keras.optimizers.Adam(learning_rate=2e-5)
bert_loss = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)
bert_model.compile(optimizer=bert_optimizer, loss=bert_loss, metrics=['accuracy'])

# Train AfriBERTa
print("Training AfriBERTa model...")
bert_history = bert_model.fit(
    train_dataset,
    validation_data=val_dataset,
    epochs=3,
    verbose=0,
)

print("✓ AfriBERTa training complete")

# Plot AfriBERTa training history: one panel per tracked metric
fig, axes = plt.subplots(1, 2, figsize=(12, 4))

panels = [
    (axes[0], 'accuracy', 'Accuracy'),
    (axes[1], 'loss', 'Loss'),
]
for panel, history_key, pretty_name in panels:
    panel.plot(bert_history.history[history_key], label=f'Train {pretty_name}')
    panel.plot(bert_history.history[f'val_{history_key}'], label=f'Val {pretty_name}')
    panel.set_title(f'AfriBERTa Model {pretty_name}', fontweight='bold')
    panel.set_xlabel('Epoch')
    panel.set_ylabel(pretty_name)
    panel.legend()
    panel.grid(alpha=0.3)

plt.tight_layout()
plt.show()

## 6. Model Evaluation and Comparison 📊

In [None]:
# Make predictions on test set
print("Generating predictions...")

# GRU predictions: pick the highest-scoring class for each row
gru_pred_probs = gru_model.predict(X_test_pad, verbose=0)
gru_pred = np.argmax(gru_pred_probs, axis=-1)

# AfriBERTa predictions: the model output exposes raw scores via `.logits`
bert_pred_probs = bert_model.predict(test_dataset, verbose=0)
bert_logits = bert_pred_probs.logits
bert_pred = np.argmax(bert_logits, axis=-1)

print("✓ Predictions generated")

# Calculate metrics for both models
def calculate_metrics(y_true, y_pred, model_name):
    """Print and return accuracy, weighted precision and weighted F1.

    Args:
        y_true: Ground-truth labels.
        y_pred: Predicted labels.
        model_name: Name shown in the printed header.

    Returns:
        Tuple ``(accuracy, precision, f1)``.
    """
    weighted = dict(average='weighted', zero_division=0)
    scores = (
        accuracy_score(y_true, y_pred),
        precision_score(y_true, y_pred, **weighted),
        f1_score(y_true, y_pred, **weighted),
    )

    print(f"\n{model_name} Results:")
    # Captions are padded to equal width so the numbers line up
    for caption, value in zip(("Accuracy: ", "Precision:", "F1-Score: "), scores):
        print(f"  {caption} {value:.4f}")

    return scores

divider = "=" * 60
print(divider)
print("MODEL EVALUATION RESULTS")
print(divider)

gru_metrics = calculate_metrics(y_test, gru_pred, "GRU Model")
bert_metrics = calculate_metrics(y_test, bert_pred, "AfriBERTa Model")

# Determine best model by accuracy (index 0 of each metrics tuple); a tie goes to the GRU
print("\n" + divider)
if bert_metrics[0] > gru_metrics[0]:
    best_model_name, best_pred, best_metrics = "AfriBERTa", bert_pred, bert_metrics
else:
    best_model_name, best_pred, best_metrics = "GRU", gru_pred, gru_metrics

print(f"Best Model: {best_model_name}")
print(f"Accuracy: {best_metrics[0]:.4f}")
print(divider)

In [None]:
# Generate confusion matrix for best model
class_names = ['Negative', 'Neutral', 'Positive']

print(f"\nDetailed Classification Report ({best_model_name}):")
report = classification_report(y_test, best_pred, target_names=class_names)
print(report)

# Confusion matrix
cm = confusion_matrix(y_test, best_pred)

# Plot confusion matrix as an annotated heatmap
fig, ax = plt.subplots(figsize=(8, 6))
heatmap_options = {
    'annot': True,
    'fmt': 'd',
    'cmap': 'Blues',
    'xticklabels': class_names,
    'yticklabels': class_names,
    'cbar_kws': {'label': 'Count'},
}
sns.heatmap(cm, ax=ax, **heatmap_options)
ax.set_title(f'Confusion Matrix - {best_model_name} Model', fontweight='bold', fontsize=14)
ax.set_xlabel('Predicted Label', fontsize=11)
ax.set_ylabel('True Label', fontsize=11)
plt.tight_layout()
plt.show()

print("✓ Evaluation complete")

In [None]:
# Model comparison visualization: grouped bars, one group per model
fig, ax = plt.subplots(figsize=(10, 6))

models = ['GRU', 'AfriBERTa']
# Transpose the two (accuracy, precision, f1) tuples into one list per metric
accuracy_scores, precision_scores, f1_scores = (
    list(per_metric) for per_metric in zip(gru_metrics, bert_metrics)
)

x = np.arange(len(models))
width = 0.25

bars1 = ax.bar(x - width, accuracy_scores, width, label='Accuracy', color='#4ECDC4', alpha=0.8)
bars2 = ax.bar(x, precision_scores, width, label='Precision', color='#FFA500', alpha=0.8)
bars3 = ax.bar(x + width, f1_scores, width, label='F1-Score', color='#FF6B6B', alpha=0.8)

ax.set_title('Model Comparison: Key Metrics', fontweight='bold', fontsize=14)
ax.set_xlabel('Model', fontweight='bold')
ax.set_ylabel('Score', fontweight='bold')
ax.set_xticks(x)
ax.set_xticklabels(models)
ax.set_ylim([0, 1.0])
ax.legend()
ax.grid(axis='y', alpha=0.3)

# Add value labels on bars
for bar in (*bars1, *bars2, *bars3):
    bar_top = bar.get_height()
    bar_center = bar.get_x() + bar.get_width()/2.
    ax.text(bar_center, bar_top, f'{bar_top:.3f}', ha='center', va='bottom', fontsize=9)

plt.tight_layout()
plt.show()

print("\n✓ Analysis complete!")
print(f"\nFinal Summary:")
print(f"  Best Model: {best_model_name}")
print(f"  Best Accuracy: {best_metrics[0]:.4f}")

# Swahili Twitter Sentiment Analysis: GRU vs AfriBERTa

**Objective:** Compare a simple GRU model with a fine-tuned AfriBERTa transformer for Swahili tweet sentiment classification.

**Contents:**
1. Setup and Data Loading
2. Exploratory Data Analysis (EDA)
3. Preprocessing Pipeline
4. GRU Model Implementation
5. AfriBERTa Fine-Tuning
6. Model Evaluation and Comparison

In [None]:
# Load Swahili Twitter Sentiment dataset
print("Loading dataset...")

# Only the fallback path creates a validation split; define the name up front
# so it exists whichever path runs.
val_data = None

try:
    # Try loading the Swahili tweets dataset
    dataset = load_dataset("swahili_tweets", "sentiment")
    train_data = pd.DataFrame(dataset['train'])
    test_data = pd.DataFrame(dataset['test']) if 'test' in dataset else None
except Exception as exc:
    # `except Exception` (not a bare except) lets Ctrl-C / SystemExit propagate,
    # and the reason for the fallback is reported instead of being hidden.
    print(f"Could not load 'swahili_tweets' ({type(exc).__name__}: {exc})")

    # Fallback: use tweet_eval dataset as demonstration
    # NOTE(review): tweet_eval is an English-language benchmark, so results
    # from this path do not reflect Swahili performance - confirm the intended
    # Swahili dataset id.
    print("Using tweet_eval dataset as fallback...")
    dataset = load_dataset("tweet_eval", "sentiment")
    train_full = pd.DataFrame(dataset['train'])
    test_data = pd.DataFrame(dataset['test']) if 'test' in dataset else None

    # Rename columns for consistency
    if 'text' not in train_full.columns:
        train_full = train_full.rename(columns={'Tweet': 'text'})
    if 'label' not in train_full.columns:
        train_full = train_full.rename(columns={'Label': 'label'})

    # Split into train/val (stratified so both parts keep the label balance)
    train_data, val_data = train_test_split(
        train_full, test_size=0.2, random_state=42, stratify=train_full['label']
    )

# Ensure consistent column names
if 'tweet' in train_data.columns:
    train_data = train_data.rename(columns={'tweet': 'text'})
if 'sentiment' in train_data.columns:
    train_data = train_data.rename(columns={'sentiment': 'label'})

print(f"\n✓ Dataset loaded: {len(train_data)} training samples")
print(f"✓ Features: {list(train_data.columns)}")
print(f"\nFirst 2 samples:")
print(train_data.head(2))

In [None]:
# Split data into train/validation/test for GRU model
# Use 60% train, 20% val, 20% test
all_positions = range(len(train_data))
train_idx, temp_idx = train_test_split(
    all_positions,
    test_size=0.4,
    random_state=42,
    stratify=train_data['label'],
)
# Halve the 40% hold-out into validation and test, again stratified by label
holdout_labels = train_data.iloc[temp_idx]['label']
val_idx, test_idx = train_test_split(
    temp_idx,
    test_size=0.5,
    random_state=42,
    stratify=holdout_labels,
)


def _texts_and_labels(positions):
    """Return (cleaned texts, labels) as arrays for the given row positions."""
    subset = train_data.iloc[positions]
    return subset['text_cleaned'].values, subset['label'].values


X_train, y_train = _texts_and_labels(train_idx)
X_val, y_val = _texts_and_labels(val_idx)
X_test, y_test = _texts_and_labels(test_idx)

print(f"Train: {len(X_train)}, Val: {len(X_val)}, Test: {len(X_test)}")

# Tokenization and padding for GRU
MAX_VOCAB = 5000
MAX_LEN = 100

# The tokenizer is fitted on the training texts only
tokenizer_gru = Tokenizer(num_words=MAX_VOCAB, oov_token='<unk>')
tokenizer_gru.fit_on_texts(X_train)

X_train_seq, X_val_seq, X_test_seq = (
    tokenizer_gru.texts_to_sequences(texts) for texts in (X_train, X_val, X_test)
)

X_train_pad, X_val_pad, X_test_pad = (
    pad_sequences(sequences, maxlen=MAX_LEN, padding='post')
    for sequences in (X_train_seq, X_val_seq, X_test_seq)
)

print(f"✓ Tokenization & padding complete")
# NOTE(review): this prints the size of the full word index, which may exceed
# the MAX_VOCAB cap passed as num_words - confirm which figure is intended.
print(f"  Vocab size: {len(tokenizer_gru.word_index) + 1}")
print(f"  Sequence shape: {X_train_pad.shape}")

In [None]:
# Model comparison visualization: grouped bars, one group per model
fig, ax = plt.subplots(figsize=(10, 6))

models = ['GRU', 'AfriBERTa']
# Transpose the two (accuracy, precision, f1) tuples into one list per metric
accuracy_scores, precision_scores, f1_scores = (
    list(per_metric) for per_metric in zip(gru_metrics, bert_metrics)
)

x = np.arange(len(models))
width = 0.25

bars1 = ax.bar(x - width, accuracy_scores, width, label='Accuracy', color='#4ECDC4', alpha=0.8)
bars2 = ax.bar(x, precision_scores, width, label='Precision', color='#FFA500', alpha=0.8)
bars3 = ax.bar(x + width, f1_scores, width, label='F1-Score', color='#FF6B6B', alpha=0.8)

ax.set_title('Model Comparison: Key Metrics', fontweight='bold', fontsize=14)
ax.set_xlabel('Model', fontweight='bold')
ax.set_ylabel('Score', fontweight='bold')
ax.set_xticks(x)
ax.set_xticklabels(models)
ax.set_ylim([0, 1.0])
ax.legend()
ax.grid(axis='y', alpha=0.3)

# Add value labels on bars
for bar in (*bars1, *bars2, *bars3):
    bar_top = bar.get_height()
    bar_center = bar.get_x() + bar.get_width()/2.
    ax.text(bar_center, bar_top, f'{bar_top:.3f}', ha='center', va='bottom', fontsize=9)

plt.tight_layout()
plt.show()

print("\n✓ Analysis complete!")
print(f"\nFinal Summary:")
print(f"  Best Model: {best_model_name}")
print(f"  Best Accuracy: {best_metrics[0]:.4f}")

In [None]:
# Generate confusion matrix for best model
class_names = ['Negative', 'Neutral', 'Positive']

print(f"\nDetailed Classification Report ({best_model_name}):")
report = classification_report(y_test, best_pred, target_names=class_names)
print(report)

# Confusion matrix
cm = confusion_matrix(y_test, best_pred)

# Plot confusion matrix as an annotated heatmap
fig, ax = plt.subplots(figsize=(8, 6))
heatmap_options = {
    'annot': True,
    'fmt': 'd',
    'cmap': 'Blues',
    'xticklabels': class_names,
    'yticklabels': class_names,
    'cbar_kws': {'label': 'Count'},
}
sns.heatmap(cm, ax=ax, **heatmap_options)
ax.set_title(f'Confusion Matrix - {best_model_name} Model', fontweight='bold', fontsize=14)
ax.set_xlabel('Predicted Label', fontsize=11)
ax.set_ylabel('True Label', fontsize=11)
plt.tight_layout()
plt.show()

print("✓ Evaluation complete")

In [None]:
# Make predictions on test set
print("Generating predictions...")

# GRU predictions: pick the highest-scoring class for each row
gru_pred_probs = gru_model.predict(X_test_pad, verbose=0)
gru_pred = np.argmax(gru_pred_probs, axis=-1)

# AfriBERTa predictions: the model output exposes raw scores via `.logits`
bert_pred_probs = bert_model.predict(test_dataset, verbose=0)
bert_logits = bert_pred_probs.logits
bert_pred = np.argmax(bert_logits, axis=-1)

print("✓ Predictions generated")

# Calculate metrics for both models
def calculate_metrics(y_true, y_pred, model_name):
    """Print and return accuracy, weighted precision and weighted F1.

    Args:
        y_true: Ground-truth labels.
        y_pred: Predicted labels.
        model_name: Name shown in the printed header.

    Returns:
        Tuple ``(accuracy, precision, f1)``.
    """
    weighted = dict(average='weighted', zero_division=0)
    scores = (
        accuracy_score(y_true, y_pred),
        precision_score(y_true, y_pred, **weighted),
        f1_score(y_true, y_pred, **weighted),
    )

    print(f"\n{model_name} Results:")
    # Captions are padded to equal width so the numbers line up
    for caption, value in zip(("Accuracy: ", "Precision:", "F1-Score: "), scores):
        print(f"  {caption} {value:.4f}")

    return scores

divider = "=" * 60
print(divider)
print("MODEL EVALUATION RESULTS")
print(divider)

gru_metrics = calculate_metrics(y_test, gru_pred, "GRU Model")
bert_metrics = calculate_metrics(y_test, bert_pred, "AfriBERTa Model")

# Determine best model by accuracy (index 0 of each metrics tuple); a tie goes to the GRU
print("\n" + divider)
if bert_metrics[0] > gru_metrics[0]:
    best_model_name, best_pred, best_metrics = "AfriBERTa", bert_pred, bert_metrics
else:
    best_model_name, best_pred, best_metrics = "GRU", gru_pred, gru_metrics

print(f"Best Model: {best_model_name}")
print(f"Accuracy: {best_metrics[0]:.4f}")
print(divider)

## 6. Model Evaluation and Comparison 📊

In [None]:
# Load and fine-tune AfriBERTa model
print("Loading AfriBERTa model...")

# Sequence-classification head with three output labels on top of MODEL_NAME
bert_model = TFAutoModelForSequenceClassification.from_pretrained(MODEL_NAME, num_labels=3)

# Compile model: the loss is configured with from_logits=True
bert_optimizer = tf.keras.optimizers.Adam(learning_rate=2e-5)
bert_loss = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)
bert_model.compile(optimizer=bert_optimizer, loss=bert_loss, metrics=['accuracy'])

# Train AfriBERTa
print("Training AfriBERTa model...")
bert_history = bert_model.fit(
    train_dataset,
    validation_data=val_dataset,
    epochs=3,
    verbose=0,
)

print("✓ AfriBERTa training complete")

# Plot AfriBERTa training history: one panel per tracked metric
fig, axes = plt.subplots(1, 2, figsize=(12, 4))

panels = [
    (axes[0], 'accuracy', 'Accuracy'),
    (axes[1], 'loss', 'Loss'),
]
for panel, history_key, pretty_name in panels:
    panel.plot(bert_history.history[history_key], label=f'Train {pretty_name}')
    panel.plot(bert_history.history[f'val_{history_key}'], label=f'Val {pretty_name}')
    panel.set_title(f'AfriBERTa Model {pretty_name}', fontweight='bold')
    panel.set_xlabel('Epoch')
    panel.set_ylabel(pretty_name)
    panel.legend()
    panel.grid(alpha=0.3)

plt.tight_layout()
plt.show()

In [None]:
# Prepare data for AfriBERTa (using transformer tokenizer)
print("Preparing data for AfriBERTa...")

# NOTE(review): this checkpoint is XLM-RoBERTa base, not AfriBERTa, although
# the notebook labels its results "AfriBERTa". If an AfriBERTa comparison is
# intended, swap in an AfriBERTa checkpoint (e.g. "castorini/afriberta_small")
# after confirming TensorFlow weights and tokenizer dependencies are available.
MODEL_NAME = "xlm-roberta-base"
tokenizer_bert = AutoTokenizer.from_pretrained(MODEL_NAME)

# Tokenize texts for BERT
def tokenize_function(texts):
    """Tokenize a list of strings into fixed-length TensorFlow tensors.

    Args:
        texts: List of input strings.

    Returns:
        The tokenizer's batch encoding, with every sequence padded or
        truncated to 128 tokens and values returned as TensorFlow tensors.
    """
    return tokenizer_bert(
        texts,
        padding="max_length",
        truncation=True,
        max_length=128,
        return_tensors="tf"
    )

# Tokenize train, val, and test sets
print("Tokenizing datasets...")
train_encodings = tokenize_function(X_train.tolist())
val_encodings = tokenize_function(X_val.tolist())
test_encodings = tokenize_function(X_test.tolist())

# Create TensorFlow datasets
# Shuffle individual examples BEFORE batching, with a buffer covering the whole
# training set. Shuffling after batch() would only reorder fixed batches, so
# every epoch would see the same examples grouped together.
train_dataset = (
    tf.data.Dataset.from_tensor_slices((dict(train_encodings), y_train))
    .shuffle(buffer_size=len(y_train))
    .batch(16)
)

# Validation and test data keep their order so predictions line up with labels
val_dataset = tf.data.Dataset.from_tensor_slices((
    dict(val_encodings),
    y_val
)).batch(16)

test_dataset = tf.data.Dataset.from_tensor_slices((
    dict(test_encodings),
    y_test
)).batch(16)

print("✓ Data prepared for AfriBERTa")

## 5. AfriBERTa Fine-Tuning ⚡

In [None]:
# Build and train GRU model
print("Building GRU model...")

gru_model = Sequential([
    # An explicit Input replaces the deprecated `input_length` argument and
    # builds the model up front, so summary() works before fit().
    tf.keras.Input(shape=(MAX_LEN,), dtype='int32'),
    # mask_zero=True: the inputs are post-padded with 0, so without a mask the
    # GRU state would be pushed through all trailing pad steps of short tweets.
    Embedding(input_dim=MAX_VOCAB, output_dim=64, mask_zero=True),
    GRU(units=64, return_sequences=False),
    Dropout(0.3),
    Dense(32, activation='relu'),
    Dense(3, activation='softmax')  # one probability per sentiment class
])

# Integer class labels -> sparse categorical cross-entropy
gru_model.compile(
    optimizer='adam',
    loss='sparse_categorical_crossentropy',
    metrics=['accuracy']
)

# summary() prints the table itself and returns None, so it is not wrapped in print()
gru_model.summary()

# Train GRU model
print("\nTraining GRU model...")
# Stop once val_loss has not improved for 2 epochs and roll back to the best weights
early_stop = EarlyStopping(monitor='val_loss', patience=2, restore_best_weights=True)

gru_history = gru_model.fit(
    X_train_pad, y_train,
    validation_data=(X_val_pad, y_val),
    epochs=5,
    batch_size=32,
    callbacks=[early_stop],
    verbose=0
)

print("✓ GRU training complete")

# Plot GRU training history
fig, axes = plt.subplots(1, 2, figsize=(12, 4))

axes[0].plot(gru_history.history['accuracy'], label='Train Accuracy')
axes[0].plot(gru_history.history['val_accuracy'], label='Val Accuracy')
axes[0].set_title('GRU Model Accuracy', fontweight='bold')
axes[0].set_xlabel('Epoch')
axes[0].set_ylabel('Accuracy')
axes[0].legend()
axes[0].grid(alpha=0.3)

axes[1].plot(gru_history.history['loss'], label='Train Loss')
axes[1].plot(gru_history.history['val_loss'], label='Val Loss')
axes[1].set_title('GRU Model Loss', fontweight='bold')
axes[1].set_xlabel('Epoch')
axes[1].set_ylabel('Loss')
axes[1].legend()
axes[1].grid(alpha=0.3)

plt.tight_layout()
plt.show()

## 4. GRU Model Implementation 🧠

In [None]:
# Text cleaning function
def clean_text(text):
    """Strip URLs, @mentions, '#' symbols and non-alphanumeric characters.

    The substitutions run in a fixed order and the result is trimmed of
    leading/trailing whitespace; interior whitespace is left untouched.
    """
    substitutions = (
        (r'http\S+|www\S+|https\S+', re.MULTILINE),  # URLs
        (r'@\w+', 0),                                # @mentions
        (r'#', 0),                                   # '#' symbol only; tag word is kept
        (r'[^A-Za-z0-9\s]', 0),                      # remaining special characters
    )
    for pattern, pattern_flags in substitutions:
        text = re.sub(pattern, '', text, flags=pattern_flags)
    return text.strip()

# Swahili stopwords: words that remove_stopwords() filters out of each tweet
swahili_stopwords = {
    'na', 'ni', 'kwa', 'za', 'wa',
    'kwamba', 'ile', 'hii', 'hiyo', 'yule',
    'mwenyewe', 'wao', 'zao', 'kwenye',
    'karibu', 'pamoja', 'sana', 'tu',
    'zaidi', 'kidogo', 'tena', 'kila',
    'nyingine', 'ingine', 'pia', 'au',
    'lakini', 'kama', 'ikiwa', 'akiwa',
    'wakati', 'kabla', 'baada', 'tangu',
}

def remove_stopwords(text):
    """Return ``text`` without Swahili stopwords.

    The text is split on whitespace, each token is compared in lower case
    against ``swahili_stopwords``, and the surviving tokens (original casing)
    are re-joined with single spaces.
    """
    kept_words = (word for word in text.split() if word.lower() not in swahili_stopwords)
    return ' '.join(kept_words)

# Apply preprocessing: clean every tweet, then strip stopwords from the result
print("Preprocessing text...")
cleaned_only = train_data['text'].apply(lambda raw: clean_text(str(raw)))
train_data['text_cleaned'] = cleaned_only.apply(remove_stopwords)

print("✓ Text cleaned and stopwords removed")
print(f"\nExample cleaned texts:")
# Show the first two rows before and after cleaning (truncated to 80 chars)
for row in range(2):
    original_text = train_data['text'].iloc[row]
    cleaned_text = train_data['text_cleaned'].iloc[row]
    print(f"  Original: {original_text[:80]}...")
    print(f"  Cleaned:  {cleaned_text[:80]}...")
    print()

## 3. Preprocessing Pipeline 🧼

In [None]:
# EDA: summary statistics and plots for the training data
banner = "=" * 60
print(banner)
print("DATASET STATISTICS")
print(banner)


def _word_count(value):
    """Number of whitespace-delimited tokens in the stringified value."""
    return len(str(value).split())


# Text length statistics
train_data['text_length'] = train_data['text'].apply(_word_count)
lengths = train_data['text_length']
mean_len = lengths.mean()
median_len = lengths.median()

print(f"\nText Length Statistics:")
print(f"  Mean: {mean_len:.1f} words")
print(f"  Median: {median_len:.1f} words")
print(f"  Max: {lengths.max()} words")
print(f"  Min: {lengths.min()} words")

# Label distribution (sentiment_names is indexed by the integer label value)
print(f"\nLabel Distribution:")
label_counts = train_data['label'].value_counts().sort_index()
sentiment_names = ['Negative', 'Neutral', 'Positive']
n_samples = len(train_data)
for label_idx, count in label_counts.items():
    pct = 100 * count / n_samples
    print(f"  {sentiment_names[label_idx]:10} (Label {label_idx}): {int(count):5} ({pct:5.1f}%)")

print(f"\nTotal samples: {n_samples}")

# Visualize label distribution: bar chart on the left, pie chart on the right
fig, axes = plt.subplots(1, 2, figsize=(14, 4))
colors = ['#FF6B6B', '#FFA500', '#4ECB71']
bar_ax, pie_ax = axes[0], axes[1]

bar_ax.bar(sentiment_names, label_counts, color=colors, alpha=0.7, edgecolor='black')
bar_ax.set_title('Sentiment Label Distribution', fontsize=12, fontweight='bold')
bar_ax.set_ylabel('Number of Samples')
bar_ax.grid(axis='y', alpha=0.3)

pie_ax.pie(label_counts, labels=sentiment_names, autopct='%1.1f%%', colors=colors, startangle=90)
pie_ax.set_title('Sentiment Distribution (%)', fontsize=12, fontweight='bold')

plt.tight_layout()
plt.show()

# Text length distribution with mean/median markers
fig, ax = plt.subplots(figsize=(10, 5))
ax.hist(lengths, bins=50, color='#4ECDC4', alpha=0.7, edgecolor='black')
ax.axvline(mean_len, color='red', linestyle='--', linewidth=2, label=f'Mean: {mean_len:.1f}')
ax.axvline(median_len, color='green', linestyle='--', linewidth=2, label=f'Median: {median_len:.1f}')
ax.set_title('Distribution of Text Lengths', fontsize=12, fontweight='bold')
ax.set_xlabel('Number of Words')
ax.set_ylabel('Frequency')
ax.legend()
ax.grid(alpha=0.3)
plt.tight_layout()
plt.show()

print("\n✓ EDA complete")

## 2. Exploratory Data Analysis (EDA) 📊

In [None]:
# Import libraries
import warnings
warnings.filterwarnings('ignore')

import os
import re
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from collections import Counter

import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.layers import Embedding, GRU, Dense, Dropout
from tensorflow.keras.models import Sequential
from tensorflow.keras.callbacks import EarlyStopping

from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_score, f1_score, confusion_matrix, classification_report
from sklearn.preprocessing import LabelEncoder

from datasets import load_dataset
from transformers import AutoTokenizer, TFAutoModelForSequenceClassification, pipeline
import numpy as np
from tqdm import tqdm

# Set seeds for reproducibility
# The same fixed value seeds NumPy's global RNG and TensorFlow's global seed.
np.random.seed(42)
tf.random.set_seed(42)

print("✓ Libraries imported successfully")

In [None]:
# Install required packages
import subprocess
import sys

packages = [
    'datasets', 'transformers', 'tensorflow', 'scikit-learn',
    'matplotlib', 'seaborn', 'pandas', 'numpy', 'tqdm'
]

# Install each package in its own pip call so one failure does not block the
# rest, but record failures instead of silently discarding them.
failed_packages = []
for package in packages:
    try:
        subprocess.check_call([sys.executable, "-m", "pip", "install", "-q", package])
    except (subprocess.CalledProcessError, OSError) as exc:
        failed_packages.append(package)
        print(f"✗ Could not install {package}: {exc}")

# Only claim success when every install actually succeeded.
if failed_packages:
    print(f"⚠ Installation failed for: {', '.join(failed_packages)}")
else:
    print("✓ All packages installed")

## 1. Setup and Data Loading 📥