In [86]:
import pandas as pd
import re
from sklearn.preprocessing import MultiLabelBinarizer
from transformers import AutoTokenizer
import torch

AFRIKAANS

In [87]:
#### AFRIKAANS ####
# Login using e.g. `huggingface-cli login` to access this dataset
# Split the dataset
splits = {'train': 'afr/train-00000-of-00001.parquet', 'dev': 'afr/dev-00000-of-00001.parquet', 'test': 'afr/test-00000-of-00001.parquet'}

# Training df
df_train_afr = pd.read_parquet("hf://datasets/brighter-dataset/BRIGHTER-emotion-categories/" + splits["train"])
df_train_afr.head()

# Dev df
df_dev_afr = pd.read_parquet("hf://datasets/brighter-dataset/BRIGHTER-emotion-categories/" + splits["dev"])
df_dev_afr.head()

# Testing df
df_test_afr = pd.read_parquet("hf://datasets/brighter-dataset/BRIGHTER-emotion-categories/" + splits["test"])
df_test_afr.head()

Unnamed: 0,id,text,anger,disgust,fear,joy,sadness,surprise,emotions
0,afr_test_track_a_00001,as regering is ons daartoe verbind om ons deel...,0,0,0,1,0,,[joy]
1,afr_test_track_a_00002,op die oomblik is die kwessie van voedselsekur...,0,0,1,0,1,,"[fear, sadness]"
2,afr_test_track_a_00003,ek hoor dikwels mense sê hulle is gereed om be...,0,0,0,0,0,,[]
3,afr_test_track_a_00004,hiervan bly kindermishandeling waarskynlik een...,1,1,0,0,1,,"[anger, disgust, sadness]"
4,afr_test_track_a_00005,so gaan ons ernstig kyk na die kwaliteit van o...,0,0,0,0,0,,[]


In [88]:
#### AFRIKAANS ####
# Structure of the dataset
print(df_train_afr.columns)
print(df_train_afr.iloc[0])

Index(['id', 'text', 'anger', 'disgust', 'fear', 'joy', 'sadness', 'surprise',
       'emotions'],
      dtype='object')
id                                    afr_train_track_a_00001
text        die grondeienaars het die departement genader ...
anger                                                       0
disgust                                                     0
fear                                                        0
joy                                                         0
sadness                                                     0
surprise                                                  NaN
emotions                                                   []
Name: 0, dtype: object


In [89]:
#### SWAHILI ####
# Check for empty rows
empty_text_rows = df_train_afr['text'].str.strip().eq('')
print(f"Empty text rows: {empty_text_rows.sum()}")

# Check for NaN
print(f"NaN: {df_train_afr.isnull().sum()}")

# Fill 'surprise' column with '0'
df_train_afr['surprise'] = df_train_afr['surprise'].fillna(0)
df_dev_afr['surprise'] = df_dev_afr['surprise'].fillna(0)
df_test_afr['surprise'] = df_test_afr['surprise'].fillna(0)

Empty text rows: 0
NaN: id             0
text           0
anger          0
disgust        0
fear           0
joy            0
sadness        0
surprise    1222
emotions       0
dtype: int64


In [90]:
#### AFRIKAANS ####
# Remove duplicates based on 'text' column
print("Duplicate texts in train:", df_train_afr['text'].duplicated().sum())
print("Duplicate texts in dev:", df_dev_afr['text'].duplicated().sum())
print("Duplicate texts in test:", df_test_afr['text'].duplicated().sum())

# Keep only first occurence of duplicate
df_train_afr = df_train_afr.drop_duplicates(subset='text')
df_dev_afr = df_dev_afr.drop_duplicates(subset='text')
df_test_afr = df_test_afr.drop_duplicates(subset='text')

Duplicate texts in train: 0
Duplicate texts in dev: 98
Duplicate texts in test: 1065


In [91]:
#### AFRIKAANS ####
# Remove punctuation and spaces
def clean_text(text):
    text = text.lower()
    text = re.sub(r'[^\w\s]', '', text)
    text = re.sub(r'\s+', ' ', text).strip()
    return text

df_train_afr['text'] = df_train_afr['text'].astype(str).apply(clean_text)
df_dev_afr['text'] = df_dev_afr['text'].astype(str).apply(clean_text)
df_test_afr['text'] = df_test_afr['text'].astype(str).apply(clean_text)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_dev_afr['text'] = df_dev_afr['text'].astype(str).apply(clean_text)


In [92]:
#### AFRIKAANS ####
# Convert multi-labels to binary vectors
mlb = MultiLabelBinarizer()

def safe_eval(x):
    try:
        return eval(x)
    except Exception:
        return []

df_train_afr['labels'] = df_train_afr['emotions'].apply(safe_eval)
df_dev_afr['labels'] = df_dev_afr['emotions'].apply(safe_eval)
df_test_afr['labels'] = df_test_afr['emotions'].apply(safe_eval)

y_train_afr = mlb.fit_transform(df_train_afr['labels'])
y_dev_afr = mlb.transform(df_dev_afr['labels'])
y_test_afr = mlb.transform(df_test_afr['labels'])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_dev_afr['labels'] = df_dev_afr['emotions'].apply(safe_eval)


In [93]:
#### AFRIKAANS ####
# Multi-label class names
label_classes_afr = mlb.classes_
print(label_classes_afr)

# Tokenize with mBert
tokenizer_afr = AutoTokenizer.from_pretrained("bert-base-multilingual-cased")

def tokenize_texts(texts):
    return tokenizer_afr(texts, padding="max_length", truncation=True, max_length=128)

train_encodings_afr = tokenize_texts(df_train_afr['text'].tolist())
dev_encodings_afr = tokenize_texts(df_dev_afr['text'].tolist())
test_encodings_afr = tokenize_texts(df_test_afr['text'].tolist())

[]


In [94]:
#### AFRIKAANS ####
from datasets import Dataset
# Convert clean and preprocessed df into Hugging face dataset format
train_dataset_afr = Dataset.from_dict({
    'input_ids': train_encodings_afr['input_ids'],
    'attention_mask': train_encodings_afr['attention_mask'],
    'labels': y_train_afr.tolist()
})

dev_dataset_afr = Dataset.from_dict({
    'input_ids': dev_encodings_afr['input_ids'],
    'attention_mask': dev_encodings_afr['attention_mask'],
    'labels': y_dev_afr.tolist()
})

test_dataset_afr = Dataset.from_dict({
    'input_ids': test_encodings_afr['input_ids'],
    'attention_mask': test_encodings_afr['attention_mask'],
    'labels': y_test_afr.tolist()
})

SWAHILI

In [95]:
#### SWAHILI ####
# Split the data
# Login using e.g. `huggingface-cli login` to access this dataset
splits = {'train': 'swa/train-00000-of-00001.parquet', 'dev': 'swa/dev-00000-of-00001.parquet', 'test': 'swa/test-00000-of-00001.parquet'}

# Training df
df_train_swa = pd.read_parquet("hf://datasets/brighter-dataset/BRIGHTER-emotion-categories/" + splits["train"])

# Dev df
df_dev_swa = pd.read_parquet("hf://datasets/brighter-dataset/BRIGHTER-emotion-categories/" + splits["dev"])

# Testing df
df_test_swa = pd.read_parquet("hf://datasets/brighter-dataset/BRIGHTER-emotion-categories/" + splits["test"])


In [96]:
#### SWAHILI ####
# Structure of the dataset
print(df_train_swa.columns)
print(df_train_swa.iloc[0])

Index(['id', 'text', 'anger', 'disgust', 'fear', 'joy', 'sadness', 'surprise',
       'emotions'],
      dtype='object')
id                               swa_train_track_a_00001
text        hii game ni goals zimetukataa tu kumamaye 😭💔
anger                                                  1
disgust                                                0
fear                                                   0
joy                                                    0
sadness                                                0
surprise                                               0
emotions                                         [anger]
Name: 0, dtype: object


In [97]:
#### SWAHILI ####
# Check for empty rows
empty_text_rows = df_train_swa['text'].str.strip().eq('')
print(f"Empty text rows: {empty_text_rows.sum()}")

# Check for NaN
print(f"NaN: {df_train_swa.isnull().sum()}")

Empty text rows: 0
NaN: id          0
text        0
anger       0
disgust     0
fear        0
joy         0
sadness     0
surprise    0
emotions    0
dtype: int64


In [98]:
#### SWAHILI ####
# Check for duplicate texts
print("Duplicate texts in Swahili train:", df_train_swa['text'].duplicated().sum())
print("Duplicate texts in Swahili dev:", df_dev_swa['text'].duplicated().sum())
print("Duplicate texts in Swahili test:", df_test_swa['text'].duplicated().sum())

# Keep first occurence of duplicate
df_train_swa = df_train_swa.drop_duplicates(subset='text')
df_dev_swa = df_dev_swa.drop_duplicates(subset='text')
df_test_swa = df_test_swa.drop_duplicates(subset='text')

Duplicate texts in Swahili train: 27
Duplicate texts in Swahili dev: 551
Duplicate texts in Swahili test: 1664


In [99]:
#### SWAHILI ####
# Remove punctuation and spaces
def clean_text(text):
    text = text.lower()
    text = re.sub(r'[^\w\s]', '', text)
    text = re.sub(r'\s+', ' ', text).strip()
    return text

df_train_swa['text'] = df_train_swa['text'].astype(str).apply(clean_text)
df_dev_swa['text'] = df_dev_swa['text'].astype(str).apply(clean_text)
df_test_swa['text'] = df_test_swa['text'].astype(str).apply(clean_text)

In [100]:
#### SWAHILI ####
# Convert multi-labels to binary vectors
df_train_swa['labels'] = df_train_swa['emotions']
df_dev_swa['labels'] = df_dev_swa['emotions']
df_test_swa['labels'] = df_test_swa['emotions']


mlb = MultiLabelBinarizer()
y_train_swa = mlb.fit_transform(df_train_swa['labels'])
y_dev_swa = mlb.transform(df_dev_swa['labels'])
y_test_swa = mlb.transform(df_test_swa['labels'])

label_classes_swa = mlb.classes_
print("Swahili label classes:", label_classes_swa)

Swahili label classes: ['anger' 'disgust' 'fear' 'joy' 'sadness' 'surprise']


In [101]:
#### SWAHILI ####
# Tokenization with mBERT
tokenizer_swa = AutoTokenizer.from_pretrained("bert-base-multilingual-cased")

def tokenize_texts(texts):
    return tokenizer_swa(texts, padding="max_length", truncation=True, max_length=128)

train_encodings_swa = tokenize_texts(df_train_swa['text'].tolist())
dev_encodings_swa = tokenize_texts(df_dev_swa['text'].tolist())
test_encodings_swa = tokenize_texts(df_test_swa['text'].tolist())


In [102]:
#### SWAHILI ####
# Convert clean and preprocessed df into Hugging face dataset format
train_dataset_swa = Dataset.from_dict({
    'input_ids': train_encodings_swa['input_ids'],
    'attention_mask': train_encodings_swa['attention_mask'],
    'labels': y_train_swa.tolist()
})

dev_dataset_swa = Dataset.from_dict({
    'input_ids': dev_encodings_swa['input_ids'],
    'attention_mask': dev_encodings_swa['attention_mask'],
    'labels': y_dev_swa.tolist()
})

test_dataset_swa = Dataset.from_dict({
    'input_ids': test_encodings_swa['input_ids'],
    'attention_mask': test_encodings_swa['attention_mask'],
    'labels': y_test_swa.tolist()
})

ENGLISH

In [103]:
#### ENGLISH ####
# Login using e.g. `huggingface-cli login` to access this dataset
# Split the dataset

splits = {
    'train': 'eng/train-00000-of-00001.parquet',
    'dev': 'eng/dev-00000-of-00001.parquet',
    'test': 'eng/test-00000-of-00001.parquet'
}

# Training df
df_train_eng = pd.read_parquet("hf://datasets/brighter-dataset/BRIGHTER-emotion-categories/" + splits["train"])

# Dev df
df_dev_eng = pd.read_parquet("hf://datasets/brighter-dataset/BRIGHTER-emotion-categories/" + splits["dev"])

#Testing df
df_test_eng = pd.read_parquet("hf://datasets/brighter-dataset/BRIGHTER-emotion-categories/" + splits["test"])


In [104]:
#### ENGLISH ####
empty_text_rows = df_train_eng['text'].str.strip().eq('')
print(f"Empty text rows: {empty_text_rows.sum()}")

# Check for NaN
print(f"NaN: {df_train_eng.isnull().sum()}")

# Fill 'disgust' column with '0'
df_train_eng['disgust'] = df_train_eng['disgust'].fillna(0)
df_dev_eng['disgust'] = df_dev_eng['disgust'].fillna(0)
df_test_eng['disgust'] = df_test_eng['disgust'].fillna(0)

Empty text rows: 0
NaN: id             0
text           0
anger          0
disgust     2768
fear           0
joy            0
sadness        0
surprise       0
emotions       0
dtype: int64


In [105]:
#### ENGLISH ####
# Check for duplicate texts
print("Duplicate texts in English train:", df_train_eng['text'].duplicated().sum())
print("Duplicate texts in English dev:", df_dev_eng['text'].duplicated().sum())
print("Duplicate texts in English test:", df_test_eng['text'].duplicated().sum())

# Keep first occurence of duplicate
df_train_eng = df_train_eng.drop_duplicates(subset='text')
df_dev_eng = df_dev_eng.drop_duplicates(subset='text')
df_test_eng = df_test_eng.drop_duplicates(subset='text')

Duplicate texts in English train: 4
Duplicate texts in English dev: 116
Duplicate texts in English test: 2772


In [106]:
#### ENGLISH ####
# Remove punctuation and spaces
def clean_text(text):
    text = text.lower()
    text = re.sub(r'[^\w\s]', '', text)
    text = re.sub(r'\s+', ' ', text).strip()
    return text

df_train_eng['text'] = df_train_eng['text'].astype(str).apply(clean_text)
df_dev_eng['text'] = df_dev_eng['text'].astype(str).apply(clean_text)
df_test_eng['text'] = df_test_eng['text'].astype(str).apply(clean_text)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_train_eng['text'] = df_train_eng['text'].astype(str).apply(clean_text)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_dev_eng['text'] = df_dev_eng['text'].astype(str).apply(clean_text)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_test_eng['text'] = df_test_eng['text'].astype(str).apply(c

In [107]:
#### ENGLISH ####
# Convert multi-labels to binary vectors
df_train_eng['labels'] = df_train_eng['emotions']
df_dev_eng['labels'] = df_dev_eng['emotions']
df_test_eng['labels'] = df_test_eng['emotions']

mlb = MultiLabelBinarizer()

y_train_eng = mlb.fit_transform(df_train_eng['labels'])
y_dev_eng = mlb.transform(df_dev_eng['labels'])
y_test_eng = mlb.transform(df_test_eng['labels'])

print("English label classes:", mlb.classes_)

English label classes: ['anger' 'fear' 'joy' 'sadness' 'surprise']


In [108]:
#### ENGLISH ####
# Tokenize texts using mBERT tokenizer
tokenizer_eng = AutoTokenizer.from_pretrained("bert-base-multilingual-cased")

def tokenize_texts(texts):
    return tokenizer_eng(texts, padding="max_length", truncation=True, max_length=128)

train_encodings_eng = tokenize_texts(df_train_eng['text'].tolist())
dev_encodings_eng = tokenize_texts(df_dev_eng['text'].tolist())
test_encodings_eng = tokenize_texts(df_test_eng['text'].tolist())

In [110]:
#### ENGLISH ####
# Convert clean and preprocessed df into Hugging face dataset format
train_dataset_eng = Dataset.from_dict({
    'input_ids': train_encodings_eng['input_ids'],
    'attention_mask': train_encodings_eng['attention_mask'],
    'labels': y_train_eng.tolist()
})

dev_dataset_eng = Dataset.from_dict({
    'input_ids': dev_encodings_eng['input_ids'],
    'attention_mask': dev_encodings_eng['attention_mask'],
    'labels': y_dev_eng.tolist()
})

test_dataset_eng = Dataset.from_dict({
    'input_ids': test_encodings_eng['input_ids'],
    'attention_mask': test_encodings_eng['attention_mask'],
    'labels': y_test_eng.tolist()
})

# **Cross-Lingual Transfer**

In [112]:
print("Sample raw emotions:", df_train_eng['emotions'].head(10).tolist())
print("Types:", df_train_eng['emotions'].apply(type).value_counts())


Sample raw emotions: [array(['fear', 'surprise'], dtype=object), array(['fear'], dtype=object), array(['fear', 'sadness'], dtype=object), array([], dtype=object), array(['fear', 'sadness', 'surprise'], dtype=object), array(['fear', 'surprise'], dtype=object), array(['anger', 'fear'], dtype=object), array(['fear', 'sadness'], dtype=object), array(['fear'], dtype=object), array(['joy'], dtype=object)]
Types: emotions
<class 'numpy.ndarray'>    2764
Name: count, dtype: int64


In [115]:
# -----------------------------------------
# Import necessary libraries
# -----------------------------------------
from transformers import TrainingArguments, Trainer, AutoModelForSequenceClassification, AutoTokenizer
import numpy as np
import torch
from sklearn.metrics import f1_score
from sklearn.preprocessing import MultiLabelBinarizer
import ast

# -----------------------------------------
# Load and preprocess data
# -----------------------------------------

# Create an instance of MultiLabelBinarizer to one-hot encode label lists
mlb = MultiLabelBinarizer()

# Define a safe version of eval to parse stringified lists in the 'emotions' column
def safe_eval(x):
    try:
        return ast.literal_eval(x) if isinstance(x, str) else []
    except Exception:
        return []

# Apply label parsing and assignment to each dataframe
# The 'emotions' column is assumed to contain stringified lists like "['joy', 'surprise']"
# This maps those lists directly into a new 'labels' column (can apply `safe_eval` here if needed)
for df in [df_train_eng, df_dev_eng, df_test_eng,
           df_train_afr, df_dev_afr, df_test_afr,
           df_train_swa, df_dev_swa, df_test_swa]:
    df['labels'] = df['emotions']  # Replace with: df['labels'] = df['emotions'].apply(safe_eval) if parsing is needed

# Sanity check to confirm correct label format after assignment
print("Sample parsed labels:", df_train_eng['labels'].head())

# -----------------------------------------
# Binarize labels (one-hot encoding for multi-label classification)
# -----------------------------------------

# Fit the binarizer on English training labels only
# This establishes the class order and builds the one-hot encoding matrix
y_train_eng = mlb.fit_transform(df_train_eng['labels']).astype(np.float32)

# Transform other sets using the same class mapping
y_dev_eng = mlb.transform(df_dev_eng['labels']).astype(np.float32)
y_test_eng = mlb.transform(df_test_eng['labels']).astype(np.float32)
y_test_afr = mlb.transform(df_test_afr['labels']).astype(np.float32)
y_test_swa = mlb.transform(df_test_swa['labels']).astype(np.float32)

# Confirm output shapes and class mapping
print("Classes:", mlb.classes_)  # e.g., ['anger' 'fear' 'joy' 'sadness' 'surprise']
print("y_train_eng shape:", y_train_eng.shape)  # (num_samples, num_classes)

# -----------------------------------------
# Tokenization (text → input_ids, attention_mask, etc.)
# -----------------------------------------

# Load a pre-trained multilingual BERT tokenizer
tokenizer = AutoTokenizer.from_pretrained("bert-base-multilingual-cased")

# Tokenization function that applies truncation and padding to 128 tokens
def tokenize_function(texts):
    return tokenizer(texts, padding="max_length", truncation=True, max_length=128)

# Apply tokenization to each language-specific dataset
train_encodings = tokenize_function(df_train_eng["text"].tolist())
dev_encodings = tokenize_function(df_dev_eng["text"].tolist())
test_encodings_eng = tokenize_function(df_test_eng["text"].tolist())
test_encodings_afr = tokenize_function(df_test_afr["text"].tolist())
test_encodings_swa = tokenize_function(df_test_swa["text"].tolist())

# -----------------------------------------
# Create a PyTorch Dataset class to use with the Trainer
# -----------------------------------------

class EmotionDataset(torch.utils.data.Dataset):
    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = torch.tensor(labels, dtype=torch.float32)  # Ensure float for multi-label BCE loss

    def __getitem__(self, idx):
        # Gather individual sample's encoded input
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        item["labels"] = self.labels[idx]  # Add the corresponding one-hot encoded label
        return item

    def __len__(self):
        return len(self.labels)

# Instantiate datasets
train_dataset_eng = EmotionDataset(train_encodings, y_train_eng)
dev_dataset_eng = EmotionDataset(dev_encodings, y_dev_eng)
test_dataset_eng = EmotionDataset(test_encodings_eng, y_test_eng)
test_dataset_afr = EmotionDataset(test_encodings_afr, y_test_afr)
test_dataset_swa = EmotionDataset(test_encodings_swa, y_test_swa)

# -----------------------------------------
# Load the model for sequence classification
# -----------------------------------------

# Load a multilingual BERT model with:
# - Correct number of output labels
# - Classification type explicitly set to multi-label
model = AutoModelForSequenceClassification.from_pretrained(
    "bert-base-multilingual-cased",
    num_labels=len(mlb.classes_),               # Number of output nodes = number of emotion classes
    problem_type="multi_label_classification"   # Important for correct loss function (BCEWithLogits)
)


Sample parsed labels: 0             [fear, surprise]
1                       [fear]
2              [fear, sadness]
3                           []
4    [fear, sadness, surprise]
Name: labels, dtype: object
Classes: ['anger' 'fear' 'joy' 'sadness' 'surprise']
y_train_eng shape: (2764, 5)


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-multilingual-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [117]:
#====TESTING====


# Print label-related info
print("Classes:", mlb.classes_)
print("Shape of y_train_eng:", y_train_eng.shape)
print("Sample binarized labels:", y_train_eng[:5])
print("Sample original labels:", df_train_eng['labels'].head(5).tolist())

# Print tokenization info
print("Sample tokenized input keys:", train_encodings.keys())
print("Example tokenized input IDs for first sample:", train_encodings['input_ids'][0])
print("Length of first tokenized input:", len(train_encodings['input_ids'][0]))

# Check one item from the dataset
sample_item = train_dataset_eng[0]
print("Sample dataset item keys:", sample_item.keys())
print("Labels tensor shape:", sample_item['labels'].shape)
print("Labels tensor:", sample_item['labels'])

# Prepare a batch for testing the model
batch = {
    key: torch.tensor(val[:2]) for key, val in train_encodings.items()
}

# Put model in evaluation mode and run inference
model.eval()
with torch.no_grad():
    outputs = model(
        input_ids=batch['input_ids'],
        attention_mask=batch['attention_mask'],
        token_type_ids=batch.get('token_type_ids')  # safely include if exists
    )
    logits = outputs.logits

# Print logits info
print("Output logits shape:", logits.shape)
print("Output logits (sample):", logits)


Classes: ['anger' 'fear' 'joy' 'sadness' 'surprise']
Shape of y_train_eng: (2764, 5)
Sample binarized labels: [[0. 1. 0. 0. 1.]
 [0. 1. 0. 0. 0.]
 [0. 1. 0. 1. 0.]
 [0. 0. 0. 0. 0.]
 [0. 1. 0. 1. 1.]]
Sample original labels: [array(['fear', 'surprise'], dtype=object), array(['fear'], dtype=object), array(['fear', 'sadness'], dtype=object), array([], dtype=object), array(['fear', 'sadness', 'surprise'], dtype=object)]
Sample tokenized input keys: dict_keys(['input_ids', 'token_type_ids', 'attention_mask'])
Example tokenized input IDs for first sample: [101, 14136, 11272, 20181, 10108, 11858, 57204, 102, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
Length of first tokenized inpu

# Adapter-Based Fine-Tuning

In [122]:
from adapters import ConfigUnion, PrefixTuningConfig, ParBnConfig, AutoAdapterModel
from transformers import AutoTokenizer

# Example label list
labels = ["joy", "anger", "sadness", "surprise", "fear"]
label2id = {l: i for i, l in enumerate(labels)}
id2label = {i: l for i, l in enumerate(labels)}

tokenizer = AutoTokenizer.from_pretrained("bert-base-multilingual-cased")

# Use AutoAdapterModel (instead of AutoModelForSequenceClassification) to support adapters
model = AutoAdapterModel.from_pretrained(
    "bert-base-multilingual-cased",
    num_labels=len(labels),
    problem_type="multi_label_classification",
    id2label=id2label,
    label2id=label2id,
)

# Combine adapter configs with ConfigUnion
adapter_config = ConfigUnion(
    PrefixTuningConfig(prefix_length=20),
    ParBnConfig(reduction_factor=4),
)

# Add adapter with combined config and activate immediately
model.add_adapter("my_adapter", config=adapter_config, set_active=True)

# Put model into adapter training mode for the adapter
model.train_adapter("my_adapter")

# Optionally explicitly set active adapters (not necessary if set_active=True above)
model.set_active_adapters("my_adapter")


Some weights of BertAdapterModel were not initialized from the model checkpoint at bert-base-multilingual-cased and are newly initialized: ['heads.default.3.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [124]:
import torch
from adapters import ConfigUnion, PrefixTuningConfig, ParBnConfig, AutoAdapterModel
from transformers import AutoTokenizer

# === Setup ===
labels = ["joy", "anger", "sadness", "surprise", "fear"]
label2id = {l: i for i, l in enumerate(labels)}
id2label = {i: l for i, l in enumerate(labels)}

tokenizer = AutoTokenizer.from_pretrained("bert-base-multilingual-cased")

model = AutoAdapterModel.from_pretrained(
    "bert-base-multilingual-cased",
    num_labels=len(labels),
    problem_type="multi_label_classification",
    id2label=id2label,
    label2id=label2id,
)

adapter_config = ConfigUnion(
    PrefixTuningConfig(prefix_length=20),
    ParBnConfig(reduction_factor=4),
)

model.add_adapter("my_adapter", config=adapter_config, set_active=True)
model.train_adapter("my_adapter")
model.set_active_adapters("my_adapter")

# === Verification ===
print("\n=== VERIFICATION CHECKS ===")

# Check if adapter config is attached properly
has_adapters = hasattr(model.config, "adapters")
print("Config has adapters attribute:", has_adapters)

if has_adapters:
    print("Available adapters:", model.config.adapters.adapter_list())
else:
    print("No adapters found in config!")

print("Active adapters:", model.active_adapters)

# Check which parameters are trainable (should be adapter params)
trainable_params = [name for name, param in model.named_parameters() if "adapters.my_adapter" in name and param.requires_grad]
print(f"\nTrainable adapter parameters ({len(trainable_params)}):")
for name in trainable_params:
    print("  -", name)

# Check frozen base model parameters
frozen_params = [name for name, param in model.named_parameters() if "adapters.my_adapter" not in name and not param.requires_grad]
print(f"\nNumber of frozen base parameters: {len(frozen_params)}")

# Check classifier head info
print("\nClassifier head info:")
print("  - problem_type:", model.config.problem_type)
print("  - label2id:", model.config.label2id)

# Run a sample forward pass
sample_text = "I am feeling joyful and surprised today!"
inputs = tokenizer(sample_text, return_tensors="pt", padding=True, truncation=True)
try:
    model.eval()
    with torch.no_grad():
        outputs = model(**inputs)
    print("\nForward pass successful!")
    print("Logits shape:", outputs.logits.shape)
    print("Logits:", outputs.logits)
except Exception as e:
    print("Forward pass failed:", e)


Some weights of BertAdapterModel were not initialized from the model checkpoint at bert-base-multilingual-cased and are newly initialized: ['heads.default.3.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.



=== VERIFICATION CHECKS ===
Config has adapters attribute: False
No adapters found in config!
Active adapters: Stack[my_adapter]

Trainable adapter parameters (48):
  - bert.encoder.layer.0.output.adapters.my_adapter.adapter_down.0.weight
  - bert.encoder.layer.0.output.adapters.my_adapter.adapter_down.0.bias
  - bert.encoder.layer.0.output.adapters.my_adapter.adapter_up.weight
  - bert.encoder.layer.0.output.adapters.my_adapter.adapter_up.bias
  - bert.encoder.layer.1.output.adapters.my_adapter.adapter_down.0.weight
  - bert.encoder.layer.1.output.adapters.my_adapter.adapter_down.0.bias
  - bert.encoder.layer.1.output.adapters.my_adapter.adapter_up.weight
  - bert.encoder.layer.1.output.adapters.my_adapter.adapter_up.bias
  - bert.encoder.layer.2.output.adapters.my_adapter.adapter_down.0.weight
  - bert.encoder.layer.2.output.adapters.my_adapter.adapter_down.0.bias
  - bert.encoder.layer.2.output.adapters.my_adapter.adapter_up.weight
  - bert.encoder.layer.2.output.adapters.my_adapte

# Data Augmentation (Back-Translation):

In [37]:
!pip install deep-translator



In [125]:
import time, random
from datetime import timedelta

# Sample 30% of the English training data
num_augment = int(0.3 * len(df_train_eng))
sampled_indices = random.sample(range(len(df_train_eng)), num_augment)

augmented_texts = []
augmented_labels = []

print(f"Generating {num_augment} augmented samples...\n")
start_time = time.time()

for i, idx in enumerate(sampled_indices, 1):  # Start index at 1 for nicer output
    orig_text = df_train_eng.iloc[idx]['text']
    back_text = back_translate(orig_text, src='en', mid='sw')
    augmented_texts.append(back_text)
    augmented_labels.append(df_train_eng.iloc[idx]['labels'])

    # Optional delay to avoid rate limiting
    time.sleep(1.5)

    # Show progress every 50 samples
    if i % 50 == 0 or i == num_augment:
        elapsed = time.time() - start_time
        avg_time = elapsed / i
        eta = avg_time * (num_augment - i)
        print(f"[{i}/{num_augment}] - Elapsed: {timedelta(seconds=int(elapsed))} - ETA: {timedelta(seconds=int(eta))}")

print("\nAugmentation completed!")


# Create augmented DataFrame
df_aug = pd.DataFrame({
    "text": augmented_texts,
    "labels": augmented_labels
})

# Combine with original data
df_train_augmented = pd.concat([df_train_eng, df_aug], ignore_index=True)

# Re-tokenize and encode
train_encodings_aug = tokenize_function(df_train_augmented["text"].tolist())
y_train_aug = mlb.transform(df_train_augmented['labels']).astype(np.float32)
train_dataset_aug = EmotionDataset(train_encodings_aug, y_train_aug)


Generating 829 augmented samples...

[50/829] - Elapsed: 0:02:20 - ETA: 0:36:33
[100/829] - Elapsed: 0:04:45 - ETA: 0:34:37
[150/829] - Elapsed: 0:07:13 - ETA: 0:32:40
[200/829] - Elapsed: 0:09:37 - ETA: 0:30:15
[250/829] - Elapsed: 0:12:11 - ETA: 0:28:15
[300/829] - Elapsed: 0:14:30 - ETA: 0:25:34
[350/829] - Elapsed: 0:17:10 - ETA: 0:23:30
[400/829] - Elapsed: 0:19:44 - ETA: 0:21:10
[450/829] - Elapsed: 0:21:56 - ETA: 0:18:28
[500/829] - Elapsed: 0:24:29 - ETA: 0:16:06
[550/829] - Elapsed: 0:26:45 - ETA: 0:13:34
[600/829] - Elapsed: 0:29:00 - ETA: 0:11:04
[650/829] - Elapsed: 0:31:23 - ETA: 0:08:38
[700/829] - Elapsed: 0:33:37 - ETA: 0:06:11
[750/829] - Elapsed: 0:36:11 - ETA: 0:03:48
[800/829] - Elapsed: 0:38:52 - ETA: 0:01:24
[829/829] - Elapsed: 0:40:10 - ETA: 0:00:00

Augmentation completed!


In [126]:
print(f"Original training samples: {len(df_train_eng)}")
print(f"Augmented samples generated: {len(df_aug)}")
print(f"Total training samples after augmentation: {len(df_train_augmented)}")


print("\nSample augmented texts and labels:")
print(df_aug.head())

print("\nSample combined training data:")
print(df_train_augmented.sample(5))


print(f"Encoded input keys: {train_encodings_aug.keys()}")
print(f"Number of encoded samples: {len(next(iter(train_encodings_aug.values())))}")

print(f"Labels shape: {y_train_aug.shape}")
print(f"First label vector example: {y_train_aug[0]}")

print(f"Dataset length: {len(train_dataset_aug)}")
sample_item = train_dataset_aug[0]
print(f"Sample dataset item keys: {sample_item.keys()}")
print(f"Sample labels tensor: {sample_item['labels']}")


Original training samples: 2764
Augmented samples generated: 829
Total training samples after augmentation: 3593

Sample augmented texts and labels:
                                                text                 labels
0                      Was a bad feeling of darkness        [fear, sadness]
1  At the beginning was the same when I let my sl...                  [joy]
2  I'm glad that someone was there but then again...   [fear, joy, sadness]
3                     yikes no pun intended or count  [fear, joy, surprise]
4  Initially sent on September 18, 2007 I walked ...        [fear, sadness]

Sample combined training data:
                           id  \
3340                      NaN   
761   eng_train_track_a_00762   
1276  eng_train_track_a_01277   
647   eng_train_track_a_00648   
1617  eng_train_track_a_01619   

                                                   text  anger  disgust  fear  \
3340  Enthusiastic and filled with Hope I went into ...    NaN      NaN   NaN   
76

# Evaluation and Training