 ## Swahili Sentiment Analysis
 Classifying Swahili tweets into positive, negative, and neutral sentiments.
 This solution fine-tunes a pre-trained RoBERTa (**WECHSEL-Swahili**) model using the transformers library.

In [None]:
# Import necessary libraries
import numpy as np
import pandas as pd
import tensorflow as tf
from sklearn.model_selection import StratifiedKFold, train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.utils import class_weight
from tensorflow.keras.layers import Dense, Input, Dropout, GlobalMaxPooling1D, GlobalAveragePooling1D, Concatenate
from tensorflow.keras.losses import Loss
from tensorflow.keras.models import Model
from tensorflow.keras.optimizers import Adam
from transformers import AutoTokenizer, TFAutoModel
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint, ReduceLROnPlateau
from sklearn.metrics import confusion_matrix, classification_report
from joblib import dump, load
from transformers import TFRobertaModel
from tensorflow.keras import regularizers
from tensorflow.keras.layers import Bidirectional, LSTM, BatchNormalization

# Custom loss function with label smoothing
class SparseCategoricalCrossentropyWithLabelSmoothing(Loss):
    """Sparse categorical cross-entropy with label smoothing.

    Integer class labels are one-hot encoded and then passed to Keras'
    ``categorical_crossentropy`` together with the configured smoothing
    factor.

    Args:
        label_smoothing: Smoothing factor forwarded to
            ``categorical_crossentropy``.
        name: Name given to the loss instance.
        **kwargs: Extra keyword arguments (for example ``reduction``)
            forwarded unchanged to the base ``Loss`` constructor, so an
            instance can be rebuilt from the dictionary returned by
            ``get_config``.
    """

    # Constructor
    def __init__(self, label_smoothing=0.1, name="sparse_categorical_crossentropy_with_label_smoothing", **kwargs):
        super().__init__(name=name, **kwargs)
        self.label_smoothing = label_smoothing

    # Define the loss calculation
    def call(self, y_true, y_pred):
        """Return the label-smoothed cross-entropy for each sample.

        Args:
            y_true: Integer class indices, with or without a trailing
                singleton axis.
            y_pred: Predicted class probabilities; the last axis indexes
                the classes.
        """
        # Reshape the labels to the leading dimensions of y_pred instead of
        # calling tf.squeeze without an axis: a bare squeeze also removes the
        # batch dimension whenever a batch holds a single sample.
        sparse_labels = tf.reshape(tf.cast(y_true, tf.int32), tf.shape(y_pred)[:-1])
        # Convert true labels to one-hot encoding
        labels = tf.one_hot(sparse_labels, depth=y_pred.shape[-1])
        # Apply categorical cross-entropy loss with label smoothing
        return tf.keras.losses.categorical_crossentropy(labels, y_pred, label_smoothing=self.label_smoothing)

    def get_config(self):
        """Return the constructor arguments, including ``label_smoothing``."""
        config = super().get_config()
        config["label_smoothing"] = self.label_smoothing
        return config

# Set random seeds for reproducibility
np.random.seed(42)
tf.random.set_seed(42)

# Load the training data
data = pd.read_csv('train.csv')

# Encode sentiment labels using LabelEncoder
encoder = LabelEncoder()
data['Labels'] = encoder.fit_transform(data['Labels'])
# Save the label encoder for later use
dump(encoder, 'sw-encoder.joblib')

# List of pre-trained model names
models = ['benjamin/roberta-base-wechsel-swahili']

# Split the data into training and holdout sets
train_data, holdout_data = train_test_split(data, test_size=0.1, random_state=42)

# Iterate over pre-trained models
for i, model_name in enumerate(models):

    # Load the tokenizer and save it for later use
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    tokenizer.save_pretrained(f'sw-tokenizer_{i}')

    # Tokenize input data using the pre-trained tokenizer
    inputs = tokenizer(train_data['Tweets'].to_list(), return_tensors='tf', padding=True, truncation=True, max_length=512)
    labels = train_data['Labels'].to_numpy()

    # Convert tokenized input to NumPy arrays
    input_ids = inputs['input_ids'].numpy()
    attention_mask = inputs['attention_mask'].numpy()

    # Perform Stratified K-Fold cross-validation

    # Initialize Stratified K-Fold cross-validator
    skf = StratifiedKFold(n_splits=10, shuffle=True, random_state=42)
    for fold, (train_index, val_index) in enumerate(skf.split(input_ids, labels)):

        # Drop the Keras state left over from the previous fold before
        # building a new model.
        tf.keras.backend.clear_session()

        # Define early stopping, model checkpoint and learning-rate callbacks.
        # They are created per fold: a reused ModelCheckpoint would keep the
        # best val_loss of earlier folds, could skip saving in this fold, and
        # load_weights below would then restore another fold's weights.
        early_stopping = EarlyStopping(monitor='val_loss', patience=3)
        model_checkpoint = ModelCheckpoint(f'sw-best_model_{i}.h5', monitor='val_loss', save_best_only=True)
        reduce_lr = ReduceLROnPlateau(monitor='val_loss', factor=0.1, patience=2, min_lr=1e-7, verbose=1)

        # Load a fresh copy of the pre-trained model for every fold so the
        # encoder has not already been fine-tuned on this fold's validation
        # samples during an earlier fold.
        model = TFAutoModel.from_pretrained(model_name)

        train_input_ids, val_input_ids = input_ids[train_index], input_ids[val_index]
        train_attention_mask, val_attention_mask = attention_mask[train_index], attention_mask[val_index]
        train_labels, val_labels = labels[train_index], labels[val_index]

        # Compute class weights and create the {class index: weight}
        # dictionary that is passed to fit() as class_weight
        fold_classes = np.unique(train_labels)
        class_weights = class_weight.compute_class_weight('balanced', classes=fold_classes, y=train_labels)
        class_weights_dict = {int(label): float(weight) for label, weight in zip(fold_classes, class_weights)}

        # Define model architecture

        # Define input layers for model
        input_ids_layer = Input(shape=(None,), dtype=tf.int32, name='input_ids')
        attention_mask_layer = Input(shape=(None,), dtype=tf.int32, name='attention_mask')

        # Retrieve base output from pre-trained model
        base_output = model([input_ids_layer, attention_mask_layer])[0]

        # Adding dropout to the embeddings
        base_output = Dropout(0.2)(base_output)

        # Adding Bidirectional LSTM Layer
        base_output = Bidirectional(LSTM(64, return_sequences=True))(base_output)

        # Batch normalization for regularizing and speeding up training
        base_output = BatchNormalization()(base_output)

        # Pooling layers to capture different aspects of the data
        max_pool_output = GlobalMaxPooling1D()(base_output)  # Global Max Pooling to capture strongest features
        avg_pool_output = GlobalAveragePooling1D()(base_output)  # Global Average Pooling to capture overall features

        # Concatenate the outputs from max pooling and average pooling
        pooled_output = Concatenate()([max_pool_output, avg_pool_output])

        # Fully connected layer for classification with softmax activation and L2 regularization
        output = Dense(3, activation='softmax', kernel_regularizer=regularizers.l2(0.02))(pooled_output)

        # Create a new model using input layers and the constructed output layer
        new_model = Model(inputs=[input_ids_layer, attention_mask_layer], outputs=output)

        # Compile the model with custom loss function
        optimizer = Adam(learning_rate=1e-5)
        loss = SparseCategoricalCrossentropyWithLabelSmoothing()
        metrics = ['accuracy']
        new_model.compile(optimizer=optimizer, loss=loss, metrics=metrics)

        # Train the model with model checkpoints, early stopping, and learning rate reduction for training optimization
        history = new_model.fit({'input_ids': train_input_ids, 'attention_mask': train_attention_mask},
                                train_labels, epochs=20,
                                validation_data=({'input_ids': val_input_ids, 'attention_mask': val_attention_mask}, val_labels),
                                class_weight=class_weights_dict,
                                callbacks=[early_stopping, model_checkpoint, reduce_lr])

        # Load the best model weights
        new_model.load_weights(f'sw-best_model_{i}.h5')

        # Save trained fold model
        new_model.save(f'sw-model_{i}_fold_{fold}.keras')

Evaluate the model's performance on the holdout data by loading the model saved for the last cross-validation fold, tokenizing the holdout data, making predictions, and then printing the confusion matrix and classification report.

In [None]:
# Define the custom loss function with label smoothing with reduction set to AUTO
class SparseCategoricalCrossentropyWithLabelSmoothing(tf.keras.losses.Loss):
    """Sparse categorical cross-entropy with label smoothing.

    Integer class labels are one-hot encoded and then passed to Keras'
    ``categorical_crossentropy`` together with the configured smoothing
    factor.

    Args:
        label_smoothing: Smoothing factor forwarded to
            ``categorical_crossentropy``.
        reduction: Reduction mode forwarded to the base ``Loss`` constructor.
        name: Name given to the loss instance.
    """

    def __init__(self, label_smoothing=0.1, reduction=tf.keras.losses.Reduction.AUTO, name="sparse_categorical_crossentropy_with_label_smoothing"):
        super().__init__(reduction=reduction, name=name)
        self.label_smoothing = label_smoothing

    def call(self, y_true, y_pred):
        """Return the label-smoothed cross-entropy for each sample.

        Args:
            y_true: Integer class indices, with or without a trailing
                singleton axis.
            y_pred: Predicted class probabilities; the last axis indexes
                the classes.
        """
        # Reshape the labels to the leading dimensions of y_pred instead of
        # calling tf.squeeze without an axis: a bare squeeze also removes the
        # batch dimension whenever a batch holds a single sample.
        sparse_labels = tf.reshape(tf.cast(y_true, tf.int32), tf.shape(y_pred)[:-1])
        # Convert true labels to one-hot encoding
        labels = tf.one_hot(sparse_labels, depth=y_pred.shape[-1])
        # Apply categorical cross-entropy loss with label smoothing
        return tf.keras.losses.categorical_crossentropy(labels, y_pred, label_smoothing=self.label_smoothing)

    def get_config(self):
        """Return the constructor arguments, including ``label_smoothing``."""
        config = super().get_config()
        config["label_smoothing"] = self.label_smoothing
        return config

# Names handed to load_model so the custom loss and the RoBERTa class
# referenced by the saved model can be resolved
custom_objects = dict(
    SparseCategoricalCrossentropyWithLabelSmoothing=SparseCategoricalCrossentropyWithLabelSmoothing,
    TFRobertaModel=TFRobertaModel,  # TFRobertaModel is a required custom object
)

# Restore the model saved for model index 0, fold 9
model = tf.keras.models.load_model('sw-model_0_fold_9.keras', custom_objects=custom_objects)

# Restore the tokenizer that was saved for model index 0
tokenizer = AutoTokenizer.from_pretrained('sw-tokenizer_0')

# Tokenize the holdout tweets with the same settings used for training
holdout_texts = holdout_data['Tweets'].to_list()
inputs = tokenizer(
    holdout_texts,
    return_tensors='tf',
    padding=True,
    truncation=True,
    max_length=512,
)
input_ids = inputs['input_ids'].numpy()
attention_mask = inputs['attention_mask'].numpy()

# Run the model and keep the most probable class for each tweet
holdout_features = {'input_ids': input_ids, 'attention_mask': attention_mask}
class_probabilities = model.predict(holdout_features)
predictions = np.argmax(class_probabilities, axis=-1)

# Print the confusion matrix, then the classification report
holdout_labels = holdout_data['Labels']
for build_report in (confusion_matrix, classification_report):
    print(build_report(holdout_labels, predictions))

Predict labels for the test data, transform the predictions back to their original labels, create and save a CSV file for submission.

In [None]:
# Define the custom loss function with label smoothing with reduction set to AUTO
class SparseCategoricalCrossentropyWithLabelSmoothing(tf.keras.losses.Loss):
    """Sparse categorical cross-entropy with label smoothing.

    Integer class labels are one-hot encoded and then passed to Keras'
    ``categorical_crossentropy`` together with the configured smoothing
    factor.

    Args:
        label_smoothing: Smoothing factor forwarded to
            ``categorical_crossentropy``.
        reduction: Reduction mode forwarded to the base ``Loss`` constructor.
        name: Name given to the loss instance.
    """

    def __init__(self, label_smoothing=0.1, reduction=tf.keras.losses.Reduction.AUTO, name="sparse_categorical_crossentropy_with_label_smoothing"):
        super().__init__(reduction=reduction, name=name)
        self.label_smoothing = label_smoothing

    def call(self, y_true, y_pred):
        """Return the label-smoothed cross-entropy for each sample.

        Args:
            y_true: Integer class indices, with or without a trailing
                singleton axis.
            y_pred: Predicted class probabilities; the last axis indexes
                the classes.
        """
        # Reshape the labels to the leading dimensions of y_pred instead of
        # calling tf.squeeze without an axis: a bare squeeze also removes the
        # batch dimension whenever a batch holds a single sample.
        sparse_labels = tf.reshape(tf.cast(y_true, tf.int32), tf.shape(y_pred)[:-1])
        # Convert true labels to one-hot encoding
        labels = tf.one_hot(sparse_labels, depth=y_pred.shape[-1])
        # Apply categorical cross-entropy loss with label smoothing
        return tf.keras.losses.categorical_crossentropy(labels, y_pred, label_smoothing=self.label_smoothing)

    def get_config(self):
        """Return the constructor arguments, including ``label_smoothing``."""
        config = super().get_config()
        config["label_smoothing"] = self.label_smoothing
        return config

# Names handed to load_model so the custom loss and the RoBERTa class
# referenced by the saved model can be resolved
custom_objects = dict(
    SparseCategoricalCrossentropyWithLabelSmoothing=SparseCategoricalCrossentropyWithLabelSmoothing,
    TFRobertaModel=TFRobertaModel,  # TFRobertaModel is a required custom object
)

# Restore the model saved for model index 0, fold 9
model = tf.keras.models.load_model('sw-model_0_fold_9.keras', custom_objects=custom_objects)

# Read the test set
test_data = pd.read_csv('test.csv')

# Restore the tokenizer that was saved for model index 0
tokenizer = AutoTokenizer.from_pretrained('sw-tokenizer_0')

# Tokenize the test tweets with the same settings used for training
test_texts = test_data['Tweets'].to_list()
inputs = tokenizer(
    test_texts,
    return_tensors='tf',
    padding=True,
    truncation=True,
    max_length=512,
)

# Run the model on the tokenized test tweets
test_features = {
    'input_ids': inputs['input_ids'].numpy(),
    'attention_mask': inputs['attention_mask'].numpy(),
}
output = model.predict(test_features)

# Keep the most probable class index for each tweet
test_predictions = np.argmax(output, axis=-1)

# Map the class indices back to the original label values using the
# encoder saved during training
encoder = load('sw-encoder.joblib')
test_predictions = encoder.inverse_transform(test_predictions)

# Assemble the submission table; column names must match the expected format
submission = pd.DataFrame({'ID': test_data['ID'], 'Labels': test_predictions})

# Write the submission to disk without the index column
submission.to_csv('submission.csv', index=False)