## 1. Environment Setup

In [1]:
# Install required libraries
!pip install transformers datasets torch scikit-learn pandas numpy accelerate -q

In [2]:
# Import libraries
import pandas as pd
import numpy as np
import torch
from torch import nn
from torch.utils.data import Dataset
from transformers import (
    AutoTokenizer, 
    AutoModelForSequenceClassification,
    TrainingArguments, 
    Trainer,
    EarlyStoppingCallback
)
from sklearn.metrics import f1_score, accuracy_score, classification_report
from sklearn.model_selection import train_test_split
import warnings
warnings.filterwarnings('ignore')

# Device configuration
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(f"Device: {device}")
if torch.cuda.is_available():
    print(f"GPU: {torch.cuda.get_device_name(0)}")
    print(f"Memory: {torch.cuda.get_device_properties(0).total_memory / 1e9:.2f} GB")

  from .autonotebook import tqdm as notebook_tqdm


Device: cuda
GPU: NVIDIA GeForce RTX 4060 Laptop GPU
Memory: 8.59 GB


## 2. Data Loading and Exploration

In [3]:
# Load datasets
df_train_binary = pd.read_csv('New_train_subtask1_eng.csv')
df_test = pd.read_csv('New_dev_subtask1_eng.csv')
df_train_multilabel = pd.read_csv('New_train_subtask2_eng.csv')
df_test_multilabel = pd.read_csv('New_dev_subtask2_eng.csv')

label_columns = ['political', 'racial/ethnic', 'religious', 'gender/sexual', 'other']

print("Subtask 1: Binary Classification")
print(f"Training samples: {len(df_train_binary)}")
print(f"Test samples: {len(df_test)}")
print(f"Class distribution:\n{df_train_binary['polarization'].value_counts()}")

print("\nSubtask 2: Multi-label Classification")
print(f"Training samples: {len(df_train_multilabel)}")
print(f"Test samples: {len(df_test_multilabel)}")
print("\nLabel distribution:")
for col in label_columns:
    count = df_train_multilabel[col].sum()
    pct = df_train_multilabel[col].mean() * 100
    print(f"  {col}: {count} ({pct:.2f}%)")

Subtask 1: Binary Classification
Training samples: 3222
Test samples: 160
Class distribution:
polarization
0    2047
1    1175
Name: count, dtype: int64

Subtask 2: Multi-label Classification
Training samples: 3222
Test samples: 160

Label distribution:
  political: 1150 (35.69%)
  racial/ethnic: 281 (8.72%)
  religious: 112 (3.48%)
  gender/sexual: 72 (2.23%)
  other: 126 (3.91%)


## 3. Dataset Class Definition

In [4]:
class PolarizationDataset(Dataset):
    """Custom dataset for binary classification."""
    def __init__(self, texts, labels, tokenizer, max_length=128):
        self.texts = texts
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_length = max_length
    
    def __len__(self):
        return len(self.texts)
    
    def __getitem__(self, idx):
        text = str(self.texts[idx])
        label = self.labels[idx]
        
        encoding = self.tokenizer(
            text,
            add_special_tokens=True,
            max_length=self.max_length,
            padding='max_length',
            truncation=True,
            return_attention_mask=True,
            return_tensors='pt'
        )
        
        return {
            'input_ids': encoding['input_ids'].flatten(),
            'attention_mask': encoding['attention_mask'].flatten(),
            'labels': torch.tensor(label, dtype=torch.long)
        }

class MultiLabelDataset(Dataset):
    """Custom dataset for multi-label classification."""
    def __init__(self, texts, labels, tokenizer, max_length=128):
        self.texts = texts
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_length = max_length
    
    def __len__(self):
        return len(self.texts)
    
    def __getitem__(self, idx):
        text = str(self.texts[idx])
        label = self.labels[idx]
        
        encoding = self.tokenizer(
            text,
            add_special_tokens=True,
            max_length=self.max_length,
            padding='max_length',
            truncation=True,
            return_attention_mask=True,
            return_tensors='pt'
        )
        
        return {
            'input_ids': encoding['input_ids'].flatten(),
            'attention_mask': encoding['attention_mask'].flatten(),
            'labels': torch.tensor(label, dtype=torch.float)
        }

## 4. Subtask 1: Binary Polarization Detection

### 4.1 Data Preparation

In [5]:
# Train-validation split
train_texts, val_texts, train_labels, val_labels = train_test_split(
    df_train_binary['text'].values,
    df_train_binary['polarization'].values,
    test_size=0.15,
    random_state=42,
    stratify=df_train_binary['polarization'].values
)

print(f"Training samples: {len(train_texts)}")
print(f"Validation samples: {len(val_texts)}")
print(f"Train class distribution: {np.bincount(train_labels)}")
print(f"Validation class distribution: {np.bincount(val_labels)}")

Training samples: 2738
Validation samples: 484
Train class distribution: [1740  998]
Validation class distribution: [307 177]


### 4.2 Model Initialization

In [6]:
MODEL_NAME = "cardiffnlp/twitter-roberta-base-sentiment-latest"

tokenizer_binary = AutoTokenizer.from_pretrained(MODEL_NAME)
model_binary = AutoModelForSequenceClassification.from_pretrained(
    MODEL_NAME, 
    num_labels=2,
    ignore_mismatched_sizes=True
).to(device)

train_dataset = PolarizationDataset(train_texts, train_labels, tokenizer_binary)
val_dataset = PolarizationDataset(val_texts, val_labels, tokenizer_binary)

print(f"Model: {MODEL_NAME}")
print(f"Parameters: {model_binary.num_parameters():,}")

Some weights of the model checkpoint at cardiffnlp/twitter-roberta-base-sentiment-latest were not used when initializing RobertaForSequenceClassification: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at cardiffnlp/twitter-roberta-base-sentiment-latest and are newly initialized because the shapes did not match:
- classifier.out_proj.weight: found shape torch.Size([3, 768]) in the checkpo

Model: cardiffnlp/twitter-roberta-base-sentiment-latest
Parameters: 124,647,170


### 4.3 Metrics Definition

In [7]:
def compute_metrics_binary(eval_pred):
    """Calculate F1-score and accuracy for binary classification."""
    predictions, labels = eval_pred
    predictions = np.argmax(predictions, axis=1)
    
    macro_f1 = f1_score(labels, predictions, average='macro')
    accuracy = accuracy_score(labels, predictions)
    f1_per_class = f1_score(labels, predictions, average=None)
    
    return {
        'macro_f1': macro_f1,
        'accuracy': accuracy,
        'f1_class_0': f1_per_class[0],
        'f1_class_1': f1_per_class[1]
    }

### 4.4 Training Configuration

In [8]:
training_args_binary = TrainingArguments(
    output_dir='./results_binary',
    num_train_epochs=5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=32,
    warmup_steps=200,
    weight_decay=0.01,
    logging_dir='./logs_binary',
    logging_steps=50,
    eval_strategy='steps',
    eval_steps=100,
    save_strategy='steps',
    save_steps=100,
    load_best_model_at_end=True,
    metric_for_best_model='macro_f1',
    greater_is_better=True,
    save_total_limit=2,
    fp16=torch.cuda.is_available(),
    learning_rate=2e-5,
    lr_scheduler_type='linear',
)

trainer_binary = Trainer(
    model=model_binary,
    args=training_args_binary,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    compute_metrics=compute_metrics_binary,
    callbacks=[EarlyStoppingCallback(early_stopping_patience=3)]
)

### 4.5 Model Training

In [9]:
print("Training binary classification model...")
trainer_binary.train()
print("Training completed.")

Training binary classification model...


Step,Training Loss,Validation Loss,Macro F1,Accuracy,F1 Class 0,F1 Class 1
100,0.4751,0.421706,0.777976,0.799587,0.847244,0.708709
200,0.4191,0.420355,0.809381,0.818182,0.85034,0.768421
300,0.3839,0.428992,0.795505,0.803719,0.836489,0.754522
400,0.2713,0.547783,0.82994,0.842975,0.877023,0.782857
500,0.2848,0.474612,0.821859,0.834711,0.869707,0.774011
600,0.1442,0.797985,0.812577,0.830579,0.870662,0.754491
700,0.1715,0.71604,0.826275,0.836777,0.868988,0.783562


Training completed.


### 4.6 Model Evaluation

In [10]:
eval_results = trainer_binary.evaluate()

print("Subtask 1 - Binary Classification Results")
print(f"Macro F1: {eval_results['eval_macro_f1']:.4f}")
print(f"Accuracy: {eval_results['eval_accuracy']:.4f}")
print(f"F1 (Non-Polarized): {eval_results['eval_f1_class_0']:.4f}")
print(f"F1 (Polarized): {eval_results['eval_f1_class_1']:.4f}")

predictions = trainer_binary.predict(val_dataset)
pred_labels = np.argmax(predictions.predictions, axis=1)

print("\nClassification Report:")
print(classification_report(val_labels, pred_labels, target_names=['Non-Polarized', 'Polarized']))

Subtask 1 - Binary Classification Results
Macro F1: 0.8299
Accuracy: 0.8430
F1 (Non-Polarized): 0.8770
F1 (Polarized): 0.7829

Classification Report:
               precision    recall  f1-score   support

Non-Polarized       0.87      0.88      0.88       307
    Polarized       0.79      0.77      0.78       177

     accuracy                           0.84       484
    macro avg       0.83      0.83      0.83       484
 weighted avg       0.84      0.84      0.84       484



### 4.7 Generate Test Predictions

In [11]:
test_texts = df_test['text'].values
test_dataset = PolarizationDataset(
    test_texts, 
    np.zeros(len(test_texts)),
    tokenizer_binary
)

test_predictions = trainer_binary.predict(test_dataset)
test_pred_labels = np.argmax(test_predictions.predictions, axis=1)

df_submission_task1 = pd.DataFrame({
    'id': df_test['id'],
    'polarization': test_pred_labels
})
df_submission_task1.to_csv('submission_subtask1.csv', index=False)

print(f"Test predictions: Polarized={test_pred_labels.sum()}, Non-Polarized={len(test_pred_labels) - test_pred_labels.sum()}")
print("Saved: submission_subtask1.csv")

Test predictions: Polarized=52, Non-Polarized=108
Saved: submission_subtask1.csv


## 5. Subtask 2: Multi-Label Classification

### 5.1 Data Preparation

In [12]:
train_texts_ml, val_texts_ml, train_labels_ml, val_labels_ml = train_test_split(
    df_train_multilabel['text'].values,
    df_train_multilabel[label_columns].values,
    test_size=0.15,
    random_state=42
)

print(f"Training samples: {len(train_texts_ml)}")
print(f"Validation samples: {len(val_texts_ml)}")
print("\nLabel distribution:")
for i, col in enumerate(label_columns):
    count = train_labels_ml[:, i].sum()
    pct = train_labels_ml[:, i].mean() * 100
    print(f"  {col}: {count} ({pct:.2f}%)")

Training samples: 2738
Validation samples: 484

Label distribution:
  political: 1003 (36.63%)
  racial/ethnic: 244 (8.91%)
  religious: 98 (3.58%)
  gender/sexual: 63 (2.30%)
  other: 104 (3.80%)


### 5.2 Custom Trainer for Multi-Label

In [13]:
class MultiLabelTrainer(Trainer):
    """Trainer with BCE loss for multi-label classification."""
    def compute_loss(self, model, inputs, return_outputs=False, **kwargs):
        labels = inputs.pop("labels")
        outputs = model(**inputs)
        logits = outputs.logits
        loss_fct = nn.BCEWithLogitsLoss()
        loss = loss_fct(logits, labels)
        return (loss, outputs) if return_outputs else loss

### 5.3 Model Initialization

In [14]:
tokenizer_multilabel = AutoTokenizer.from_pretrained(MODEL_NAME)
model_multilabel = AutoModelForSequenceClassification.from_pretrained(
    MODEL_NAME,
    num_labels=5,
    problem_type="multi_label_classification",
    ignore_mismatched_sizes=True
).to(device)

train_dataset_ml = MultiLabelDataset(train_texts_ml, train_labels_ml, tokenizer_multilabel)
val_dataset_ml = MultiLabelDataset(val_texts_ml, val_labels_ml, tokenizer_multilabel)

print(f"Multi-label model initialized with {model_multilabel.num_parameters():,} parameters")

Some weights of the model checkpoint at cardiffnlp/twitter-roberta-base-sentiment-latest were not used when initializing RobertaForSequenceClassification: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at cardiffnlp/twitter-roberta-base-sentiment-latest and are newly initialized because the shapes did not match:
- classifier.out_proj.weight: found shape torch.Size([3, 768]) in the checkpo

Multi-label model initialized with 124,649,477 parameters


### 5.4 Metrics Definition

In [15]:
def compute_metrics_multilabel(eval_pred):
    """Calculate macro F1 and accuracy for multi-label classification."""
    predictions, labels = eval_pred
    predictions = torch.sigmoid(torch.tensor(predictions)).numpy()
    predictions = (predictions > 0.5).astype(int)
    
    macro_f1 = f1_score(labels, predictions, average='macro', zero_division=0)
    accuracy = accuracy_score(labels, predictions)
    per_label_f1 = f1_score(labels, predictions, average=None, zero_division=0)
    
    results = {'macro_f1': macro_f1, 'accuracy': accuracy}
    for i, col in enumerate(label_columns):
        results[f'f1_{col}'] = per_label_f1[i]
    
    return results

### 5.5 Training Configuration

In [16]:
training_args_multilabel = TrainingArguments(
    output_dir='./results_multilabel',
    num_train_epochs=5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=32,
    warmup_steps=200,
    weight_decay=0.01,
    logging_dir='./logs_multilabel',
    logging_steps=50,
    eval_strategy='steps',
    eval_steps=100,
    save_strategy='steps',
    save_steps=100,
    load_best_model_at_end=True,
    metric_for_best_model='macro_f1',
    greater_is_better=True,
    save_total_limit=2,
    fp16=torch.cuda.is_available(),
    learning_rate=2e-5,
    lr_scheduler_type='linear',
)

trainer_multilabel = MultiLabelTrainer(
    model=model_multilabel,
    args=training_args_multilabel,
    train_dataset=train_dataset_ml,
    eval_dataset=val_dataset_ml,
    compute_metrics=compute_metrics_multilabel,
    callbacks=[EarlyStoppingCallback(early_stopping_patience=3)]
)

### 5.6 Model Training

In [17]:
print("Training multi-label classification model...")
trainer_multilabel.train()
print("Training completed.")

Training multi-label classification model...


Step,Training Loss,Validation Loss,Macro F1,Accuracy,F1 Political,F1 Racial/ethnic,F1 Religious,F1 Gender/sexual,F1 Other
100,0.3185,0.23771,0.100448,0.721074,0.502242,0.0,0.0,0.0,0.0
200,0.2272,0.200266,0.141754,0.743802,0.708772,0.0,0.0,0.0,0.0
300,0.2122,0.19823,0.158401,0.702479,0.689441,0.102564,0.0,0.0,0.0
400,0.1666,0.208122,0.220688,0.663223,0.688347,0.415094,0.0,0.0,0.0
500,0.1623,0.19738,0.203284,0.714876,0.712074,0.304348,0.0,0.0,0.0
600,0.1312,0.205276,0.257629,0.694215,0.685535,0.477612,0.125,0.0,0.0
700,0.127,0.211229,0.324759,0.716942,0.67541,0.548387,0.4,0.0,0.0
800,0.1006,0.209652,0.354637,0.716942,0.682274,0.545455,0.545455,0.0,0.0


Training completed.


### 5.7 Model Evaluation

In [18]:
eval_results_ml = trainer_multilabel.evaluate()

print("Subtask 2 - Multi-Label Classification Results (Baseline)")
print(f"Macro F1: {eval_results_ml['eval_macro_f1']:.4f}")
print(f"Accuracy: {eval_results_ml['eval_accuracy']:.4f}")
print("\nPer-label F1 scores:")
for col in label_columns:
    print(f"  {col}: {eval_results_ml[f'eval_f1_{col}']:.4f}")

predictions_ml = trainer_multilabel.predict(val_dataset_ml)
pred_probs_ml = torch.sigmoid(torch.tensor(predictions_ml.predictions)).numpy()
pred_labels_ml = (pred_probs_ml > 0.5).astype(int)

Subtask 2 - Multi-Label Classification Results (Baseline)
Macro F1: 0.3546
Accuracy: 0.7169

Per-label F1 scores:
  political: 0.6823
  racial/ethnic: 0.5455
  religious: 0.5455
  gender/sexual: 0.0000
  other: 0.0000


## 6. Advanced Optimization: Class-Weighted Loss

### 6.1 Calculate Class Weights

In [19]:
pos_weights = []
print("Class weights:")
for i, col in enumerate(label_columns):
    pos_count = train_labels_ml[:, i].sum()
    neg_count = len(train_labels_ml) - pos_count
    weight = neg_count / pos_count if pos_count > 0 else 1.0
    pos_weights.append(weight)
    print(f"  {col}: {weight:.2f}")

pos_weight_tensor = torch.tensor(pos_weights, dtype=torch.float).to(device)

Class weights:
  political: 1.73
  racial/ethnic: 10.22
  religious: 26.94
  gender/sexual: 42.46
  other: 25.33


### 6.2 Weighted Trainer Definition

In [20]:
class WeightedMultiLabelTrainer(Trainer):
    """Trainer with weighted BCE loss for imbalanced multi-label classification."""
    def compute_loss(self, model, inputs, return_outputs=False, **kwargs):
        labels = inputs.pop("labels")
        outputs = model(**inputs)
        logits = outputs.logits
        loss_fct = nn.BCEWithLogitsLoss(pos_weight=pos_weight_tensor)
        loss = loss_fct(logits, labels)
        return (loss, outputs) if return_outputs else loss

### 6.3 Train Weighted Model

In [21]:
model_multilabel_weighted = AutoModelForSequenceClassification.from_pretrained(
    MODEL_NAME,
    num_labels=5,
    problem_type="multi_label_classification",
    ignore_mismatched_sizes=True
).to(device)

trainer_weighted = WeightedMultiLabelTrainer(
    model=model_multilabel_weighted,
    args=training_args_multilabel,
    train_dataset=train_dataset_ml,
    eval_dataset=val_dataset_ml,
    compute_metrics=compute_metrics_multilabel,
    callbacks=[EarlyStoppingCallback(early_stopping_patience=3)]
)

print("Training class-weighted model...")
trainer_weighted.train()
print("Training completed.")

Some weights of the model checkpoint at cardiffnlp/twitter-roberta-base-sentiment-latest were not used when initializing RobertaForSequenceClassification: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at cardiffnlp/twitter-roberta-base-sentiment-latest and are newly initialized because the shapes did not match:
- classifier.out_proj.weight: found shape torch.Size([3, 768]) in the checkpo

Training class-weighted model...


Step,Training Loss,Validation Loss,Macro F1,Accuracy,F1 Political,F1 Racial/ethnic,F1 Religious,F1 Gender/sexual,F1 Other
100,1.2271,1.042984,0.260128,0.566116,0.632588,0.3,0.136646,0.0,0.231405
200,1.0223,0.885461,0.331778,0.595041,0.668639,0.405405,0.208696,0.108108,0.268041
300,0.9,0.821395,0.341454,0.60124,0.710602,0.408163,0.237624,0.116505,0.234375
400,0.6803,0.942623,0.377282,0.605372,0.63388,0.485981,0.285714,0.195122,0.285714
500,0.6584,0.963006,0.417107,0.654959,0.696429,0.484848,0.385965,0.25,0.268293
600,0.4829,1.085634,0.400651,0.665289,0.678679,0.475248,0.380952,0.222222,0.246154
700,0.536,1.183499,0.398888,0.683884,0.680982,0.527473,0.416667,0.1875,0.181818
800,0.4205,1.266889,0.424661,0.68595,0.680851,0.536585,0.487805,0.258065,0.16


Training completed.


### 6.4 Evaluate Weighted Model

In [22]:
eval_results_weighted = trainer_weighted.evaluate()

print("Subtask 2 - Class-Weighted Model Results")
print(f"Macro F1: {eval_results_weighted['eval_macro_f1']:.4f}")
print(f"Accuracy: {eval_results_weighted['eval_accuracy']:.4f}")
print(f"Improvement over baseline: {(eval_results_weighted['eval_macro_f1'] - eval_results_ml['eval_macro_f1']):.4f}")

Subtask 2 - Class-Weighted Model Results
Macro F1: 0.4247
Accuracy: 0.6860
Improvement over baseline: 0.0700


## 7. Advanced Optimization: Threshold Tuning

### 7.1 Find Optimal Thresholds

In [23]:
def find_best_thresholds(probabilities, true_labels, label_names):
    """Find optimal decision threshold for each label to maximize F1."""
    best_thresholds = []
    
    for i, label_name in enumerate(label_names):
        best_f1 = 0
        best_threshold = 0.5
        
        for threshold in np.arange(0.1, 0.9, 0.05):
            preds = (probabilities[:, i] > threshold).astype(int)
            f1 = f1_score(true_labels[:, i], preds, zero_division=0)
            
            if f1 > best_f1:
                best_f1 = f1
                best_threshold = threshold
        
        best_thresholds.append(best_threshold)
        print(f"{label_name}: threshold={best_threshold:.2f}, F1={best_f1:.4f}")
    
    return best_thresholds

print("Finding optimal thresholds for baseline model:\n")
optimal_thresholds = find_best_thresholds(pred_probs_ml, val_labels_ml, label_columns)

pred_labels_optimized = np.zeros_like(pred_probs_ml, dtype=int)
for i, threshold in enumerate(optimal_thresholds):
    pred_labels_optimized[:, i] = (pred_probs_ml[:, i] > threshold).astype(int)

macro_f1_optimized = f1_score(val_labels_ml, pred_labels_optimized, average='macro', zero_division=0)
accuracy_optimized = accuracy_score(val_labels_ml, pred_labels_optimized)

print(f"\nBaseline + Threshold Tuning Results")
print(f"Macro F1: {macro_f1_optimized:.4f}")
print(f"Accuracy: {accuracy_optimized:.4f}")
print(f"Improvement: {(macro_f1_optimized - eval_results_ml['eval_macro_f1']):.4f}")

Finding optimal thresholds for baseline model:

political: threshold=0.10, F1=0.6994
racial/ethnic: threshold=0.35, F1=0.5897
religious: threshold=0.50, F1=0.5455
gender/sexual: threshold=0.10, F1=0.1622
other: threshold=0.15, F1=0.2000

Baseline + Threshold Tuning Results
Macro F1: 0.4394
Accuracy: 0.6674
Improvement: 0.0847


### 7.2 Threshold Tuning on Weighted Model

In [24]:
predictions_weighted = trainer_weighted.predict(val_dataset_ml)
pred_probs_weighted = torch.sigmoid(torch.tensor(predictions_weighted.predictions)).numpy()

print("Finding optimal thresholds for weighted model:\n")
optimal_thresholds_weighted = find_best_thresholds(pred_probs_weighted, val_labels_ml, label_columns)

pred_labels_weighted_optimized = np.zeros_like(pred_probs_weighted, dtype=int)
for i, threshold in enumerate(optimal_thresholds_weighted):
    pred_labels_weighted_optimized[:, i] = (pred_probs_weighted[:, i] > threshold).astype(int)

macro_f1_weighted_optimized = f1_score(val_labels_ml, pred_labels_weighted_optimized, average='macro', zero_division=0)
accuracy_weighted_optimized = accuracy_score(val_labels_ml, pred_labels_weighted_optimized)

print(f"\nClass-Weighted + Threshold Tuning Results")
print(f"Macro F1: {macro_f1_weighted_optimized:.4f}")
print(f"Accuracy: {accuracy_weighted_optimized:.4f}")
print(f"Improvement over baseline: {(macro_f1_weighted_optimized - eval_results_ml['eval_macro_f1']):.4f}")

Finding optimal thresholds for weighted model:

political: threshold=0.45, F1=0.6901
racial/ethnic: threshold=0.60, F1=0.5753
religious: threshold=0.85, F1=0.6667
gender/sexual: threshold=0.85, F1=0.3000
other: threshold=0.15, F1=0.2368

Class-Weighted + Threshold Tuning Results
Macro F1: 0.4938
Accuracy: 0.6860
Improvement over baseline: 0.1391


## 8. Advanced Approach: Binary Ensemble

Train 5 separate binary classifiers (one per label) with class weighting. This approach often achieves the best performance for imbalanced multi-label classification.

### 8.1 Binary Ensemble Training

In [29]:
print("Training Binary Ensemble: 5 separate classifiers")
print("=" * 80)

ensemble_trainers = []
ensemble_models = []
ensemble_thresholds = []
ensemble_f1_scores = []

for label_idx, label_name in enumerate(label_columns):
    print(f"\nClassifier {label_idx + 1}/5: {label_name}")
    print("-" * 60)
    
    # Prepare single-label data
    train_labels_single = train_labels_ml[:, label_idx]
    val_labels_single = val_labels_ml[:, label_idx]
    
    # Class distribution
    pos_count = train_labels_single.sum()
    neg_count = len(train_labels_single) - pos_count
    class_weight = neg_count / pos_count if pos_count > 0 else 1.0
    
    print(f"Class distribution: Positive={pos_count}, Negative={neg_count}")
    print(f"Class weight: {class_weight:.2f}")
    
    # Create datasets
    train_dataset_single = PolarizationDataset(train_texts_ml, train_labels_single, tokenizer_multilabel)
    val_dataset_single = PolarizationDataset(val_texts_ml, val_labels_single, tokenizer_multilabel)
    
    # Initialize model
    model_single = AutoModelForSequenceClassification.from_pretrained(
        MODEL_NAME,
        num_labels=2,
        ignore_mismatched_sizes=True
    ).to(device)
    
    # Weighted trainer
    class WeightedBinaryTrainer(Trainer):
        def __init__(self, *args, weight=1.0, **kwargs):
            super().__init__(*args, **kwargs)
            self.weight = torch.tensor([weight], dtype=torch.float).to(device)
        
        def compute_loss(self, model, inputs, return_outputs=False, **kwargs):
            labels = inputs.pop("labels")
            outputs = model(**inputs)
            logits = outputs.logits
            loss_fct = nn.CrossEntropyLoss(weight=torch.tensor([1.0, self.weight.item()]).to(device))
            loss = loss_fct(logits, labels)
            return (loss, outputs) if return_outputs else loss
    
    # Training arguments
    args_single = TrainingArguments(
        output_dir=f'./results_ensemble_{label_name.replace("/", "_")}',
        num_train_epochs=4,
        per_device_train_batch_size=16,
        per_device_eval_batch_size=32,
        warmup_steps=100,
        weight_decay=0.01,
        logging_steps=50,
        eval_strategy='epoch',
        save_strategy='epoch',
        load_best_model_at_end=True,
        metric_for_best_model='macro_f1',
        fp16=torch.cuda.is_available(),
        learning_rate=2e-5,
        save_total_limit=1,
    )
    
    # Train
    trainer_single = WeightedBinaryTrainer(
        model=model_single,
        args=args_single,
        train_dataset=train_dataset_single,
        eval_dataset=val_dataset_single,
        compute_metrics=compute_metrics_binary,
        weight=class_weight
    )
    
    print("Training...")
    trainer_single.train()
    
    # Evaluate
    results = trainer_single.evaluate()
    val_f1 = results['eval_macro_f1']
    val_accuracy = results['eval_accuracy']
    print(f"Validation - Macro F1: {val_f1:.4f}, Accuracy: {val_accuracy:.4f}")
    
    # Find optimal threshold
    val_preds = trainer_single.predict(val_dataset_single)
    val_probs = torch.softmax(torch.tensor(val_preds.predictions), dim=1).numpy()[:, 1]
    
    best_f1 = 0
    best_threshold = 0.5
    for threshold in np.arange(0.1, 0.9, 0.05):
        preds = (val_probs > threshold).astype(int)
        f1 = f1_score(val_labels_single, preds, zero_division=0)
        if f1 > best_f1:
            best_f1 = f1
            best_threshold = threshold
    
    print(f"Optimal threshold: {best_threshold:.2f} (F1: {best_f1:.4f})")
    
    ensemble_trainers.append(trainer_single)
    ensemble_models.append(model_single)
    ensemble_thresholds.append(best_threshold)
    ensemble_f1_scores.append(best_f1)

print("\n" + "=" * 80)
print("Binary Ensemble Training Complete")
print("=" * 80)

Training Binary Ensemble: 5 separate classifiers

Classifier 1/5: political
------------------------------------------------------------
Class distribution: Positive=1003, Negative=1735
Class weight: 1.73


Some weights of the model checkpoint at cardiffnlp/twitter-roberta-base-sentiment-latest were not used when initializing RobertaForSequenceClassification: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at cardiffnlp/twitter-roberta-base-sentiment-latest and are newly initialized because the shapes did not match:
- classifier.out_proj.weight: found shape torch.Size([3, 768]) in the checkpo

Training...


Epoch,Training Loss,Validation Loss,Macro F1,Accuracy,F1 Class 0,F1 Class 1
1,0.4569,0.47477,0.760751,0.799587,0.857143,0.66436
2,0.3727,0.469932,0.772745,0.795455,0.844584,0.700906
3,0.1907,0.706694,0.75274,0.789256,0.847761,0.657718
4,0.1627,0.904352,0.752589,0.78719,0.845113,0.660066


Validation - Macro F1: 0.7727, Accuracy: 0.7955
Optimal threshold: 0.35 (F1: 0.7099)

Classifier 2/5: racial/ethnic
------------------------------------------------------------
Class distribution: Positive=244, Negative=2494
Class weight: 10.22


Some weights of the model checkpoint at cardiffnlp/twitter-roberta-base-sentiment-latest were not used when initializing RobertaForSequenceClassification: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at cardiffnlp/twitter-roberta-base-sentiment-latest and are newly initialized because the shapes did not match:
- classifier.out_proj.weight: found shape torch.Size([3, 768]) in the checkpo

Training...


Epoch,Training Loss,Validation Loss,Macro F1,Accuracy,F1 Class 0,F1 Class 1
1,0.504,0.394509,0.653168,0.822314,0.895377,0.410959
2,0.4611,0.469471,0.741099,0.915289,0.953462,0.528736
3,0.3725,0.893967,0.751255,0.929752,0.961969,0.540541
4,0.1885,1.15696,0.738212,0.929752,0.962138,0.514286


Validation - Macro F1: 0.7513, Accuracy: 0.9298
Optimal threshold: 0.65 (F1: 0.5479)

Classifier 3/5: religious
------------------------------------------------------------
Class distribution: Positive=98, Negative=2640
Class weight: 26.94


Some weights of the model checkpoint at cardiffnlp/twitter-roberta-base-sentiment-latest were not used when initializing RobertaForSequenceClassification: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at cardiffnlp/twitter-roberta-base-sentiment-latest and are newly initialized because the shapes did not match:
- classifier.out_proj.weight: found shape torch.Size([3, 768]) in the checkpo

Training...


Epoch,Training Loss,Validation Loss,Macro F1,Accuracy,F1 Class 0,F1 Class 1
1,1.087,1.147368,0.492662,0.971074,0.985325,0.0
2,1.0397,1.028243,0.492662,0.971074,0.985325,0.0
3,0.9506,0.85551,0.492662,0.971074,0.985325,0.0
4,0.6432,0.677317,0.665977,0.969008,0.984127,0.347826


Validation - Macro F1: 0.6660, Accuracy: 0.9690
Optimal threshold: 0.15 (F1: 0.5556)

Classifier 4/5: gender/sexual
------------------------------------------------------------
Class distribution: Positive=63, Negative=2675
Class weight: 42.46


Some weights of the model checkpoint at cardiffnlp/twitter-roberta-base-sentiment-latest were not used when initializing RobertaForSequenceClassification: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at cardiffnlp/twitter-roberta-base-sentiment-latest and are newly initialized because the shapes did not match:
- classifier.out_proj.weight: found shape torch.Size([3, 768]) in the checkpo

Training...


Epoch,Training Loss,Validation Loss,Macro F1,Accuracy,F1 Class 0,F1 Class 1
1,0.8125,1.173267,0.495308,0.981405,0.990615,0.0
2,1.0034,1.389913,0.495308,0.981405,0.990615,0.0
3,0.7664,1.148019,0.642616,0.971074,0.985232,0.3
4,0.2769,1.225041,0.660351,0.975207,0.987368,0.333333


Validation - Macro F1: 0.6604, Accuracy: 0.9752
Optimal threshold: 0.30 (F1: 0.3810)

Classifier 5/5: other
------------------------------------------------------------
Class distribution: Positive=104, Negative=2634
Class weight: 25.33


Some weights of the model checkpoint at cardiffnlp/twitter-roberta-base-sentiment-latest were not used when initializing RobertaForSequenceClassification: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at cardiffnlp/twitter-roberta-base-sentiment-latest and are newly initialized because the shapes did not match:
- classifier.out_proj.weight: found shape torch.Size([3, 768]) in the checkpo

Training...


Epoch,Training Loss,Validation Loss,Macro F1,Accuracy,F1 Class 0,F1 Class 1
1,0.7986,0.783996,0.488372,0.954545,0.976744,0.0
2,1.1756,1.248712,0.488372,0.954545,0.976744,0.0
3,1.1164,1.950869,0.488372,0.954545,0.976744,0.0
4,0.7347,1.998301,0.488372,0.954545,0.976744,0.0


Validation - Macro F1: 0.4884, Accuracy: 0.9545
Optimal threshold: 0.10 (F1: 0.0870)

Binary Ensemble Training Complete


### 8.2 Ensemble Evaluation on Validation Set

In [30]:
print("Generating ensemble predictions on validation set...")

ensemble_val_predictions = np.zeros((len(val_labels_ml), 5), dtype=int)

for i, (trainer, threshold, label_name) in enumerate(zip(ensemble_trainers, ensemble_thresholds, label_columns)):
    val_dataset_single = PolarizationDataset(val_texts_ml, val_labels_ml[:, i], tokenizer_multilabel)
    preds = trainer.predict(val_dataset_single)
    probs = torch.softmax(torch.tensor(preds.predictions), dim=1).numpy()[:, 1]
    ensemble_val_predictions[:, i] = (probs > threshold).astype(int)

# Calculate ensemble metrics
ensemble_macro_f1 = f1_score(val_labels_ml, ensemble_val_predictions, average='macro', zero_division=0)
ensemble_accuracy = accuracy_score(val_labels_ml, ensemble_val_predictions)
per_label_f1 = f1_score(val_labels_ml, ensemble_val_predictions, average=None, zero_division=0)

print("\nBinary Ensemble Results")
print(f"  Macro F1:  {ensemble_macro_f1:.4f}")
print(f"  Accuracy:  {ensemble_accuracy:.4f}")
print(f"  F1 Gain:   +{(ensemble_macro_f1 - eval_results_ml['eval_macro_f1']):.4f}")

print("\nPer-label F1 scores:")
for i, label in enumerate(label_columns):
    print(f"  {label}: {per_label_f1[i]:.4f}")

Generating ensemble predictions on validation set...



Binary Ensemble Results
  Macro F1:  0.4563
  Accuracy:  0.0269
  F1 Gain:   +0.1016

Per-label F1 scores:
  political: 0.7099
  racial/ethnic: 0.5479
  religious: 0.5556
  gender/sexual: 0.3810
  other: 0.0870


### 8.3 Generate Ensemble Test Predictions

In [31]:
print("Generating ensemble test predictions...")

ensemble_test_predictions = np.zeros((len(polarized_texts), 5), dtype=int)

for i, (model, threshold, label_name) in enumerate(zip(ensemble_models, ensemble_thresholds, label_columns)):
    print(f"  Predicting {label_name}...")
    
    model.eval()
    all_probs = []
    
    with torch.no_grad():
        for j in range(0, len(polarized_texts), 32):
            batch_texts = polarized_texts[j:j+32]
            encodings = tokenizer_multilabel(
                list(batch_texts),
                max_length=128,
                padding='max_length',
                truncation=True,
                return_tensors='pt'
            ).to(device)
            
            outputs = model(**encodings)
            probs = torch.softmax(outputs.logits, dim=1)[:, 1].cpu().numpy()
            all_probs.extend(probs)
    
    all_probs = np.array(all_probs)
    ensemble_test_predictions[:, i] = (all_probs > threshold).astype(int)

# Create full predictions
full_ensemble_predictions = np.zeros((len(test_texts), 5), dtype=int)
full_ensemble_predictions[polarized_indices] = ensemble_test_predictions

print("\nEnsemble test predictions:")
for i, col in enumerate(label_columns):
    print(f"  {col}: {full_ensemble_predictions[:, i].sum()}")

# Save ensemble submission
df_submission_ensemble = pd.DataFrame({
    'id': df_test['id'],
    'political': full_ensemble_predictions[:, 0],
    'racial/ethnic': full_ensemble_predictions[:, 1],
    'religious': full_ensemble_predictions[:, 2],
    'gender/sexual': full_ensemble_predictions[:, 3],
    'other': full_ensemble_predictions[:, 4]
})
df_submission_ensemble.to_csv('submission_subtask2_ensemble.csv', index=False)

print("\nSaved: submission_subtask2_ensemble.csv")

Generating ensemble test predictions...
  Predicting political...
  Predicting racial/ethnic...
  Predicting religious...
  Predicting gender/sexual...
  Predicting other...

Ensemble test predictions:
  political: 51
  racial/ethnic: 12
  religious: 5
  gender/sexual: 3
  other: 52

Saved: submission_subtask2_ensemble.csv


## 9. Alternative: Class-Weighted Model Predictions

### 9.1 Test Predictions with Class-Weighted Model

In [25]:
# Use weighted model with optimized thresholds (best performance)
polarized_indices = np.where(test_pred_labels == 1)[0]
polarized_texts = test_texts[polarized_indices]

test_dataset_ml = MultiLabelDataset(
    polarized_texts,
    np.zeros((len(polarized_texts), 5)),
    tokenizer_multilabel
)

test_predictions_ml = trainer_weighted.predict(test_dataset_ml)
test_pred_probs_ml = torch.sigmoid(torch.tensor(test_predictions_ml.predictions)).numpy()

test_pred_labels_ml = np.zeros_like(test_pred_probs_ml, dtype=int)
for i, threshold in enumerate(optimal_thresholds_weighted):
    test_pred_labels_ml[:, i] = (test_pred_probs_ml[:, i] > threshold).astype(int)

full_predictions = np.zeros((len(test_texts), 5), dtype=int)
full_predictions[polarized_indices] = test_pred_labels_ml

print(f"Polarized texts: {len(polarized_indices)}")
print("\nTest predictions:")
for i, col in enumerate(label_columns):
    print(f"  {col}: {full_predictions[:, i].sum()}")

Polarized texts: 52

Test predictions:
  political: 52
  racial/ethnic: 13
  religious: 5
  gender/sexual: 2
  other: 20


### 9.2 Save Class-Weighted Submission

In [26]:
df_submission_task2 = pd.DataFrame({
    'id': df_test['id'],
    'political': full_predictions[:, 0],
    'racial/ethnic': full_predictions[:, 1],
    'religious': full_predictions[:, 2],
    'gender/sexual': full_predictions[:, 3],
    'other': full_predictions[:, 4]
})
df_submission_task2.to_csv('submission_subtask2.csv', index=False)

print("Saved: submission_subtask2.csv")

Saved: submission_subtask2.csv


## 10. Results Summary

In [32]:
print("=" * 80)
print("FINAL RESULTS SUMMARY")
print("=" * 80)

print("\nSubtask 1: Binary Classification")
print(f"  Macro F1:  {eval_results['eval_macro_f1']:.4f}")
print(f"  Accuracy:  {eval_results['eval_accuracy']:.4f}")

print("\nSubtask 2: Multi-Label Classification")
print(f"  Baseline Model:")
print(f"    Macro F1:  {eval_results_ml['eval_macro_f1']:.4f}")
print(f"    Accuracy:  {eval_results_ml['eval_accuracy']:.4f}")

print(f"\n  Class-Weighted Model:")
print(f"    Macro F1:  {eval_results_weighted['eval_macro_f1']:.4f}")
print(f"    Accuracy:  {eval_results_weighted['eval_accuracy']:.4f}")
print(f"    F1 Gain:   +{(eval_results_weighted['eval_macro_f1'] - eval_results_ml['eval_macro_f1']):.4f}")

print(f"\n  Baseline + Threshold Tuning:")
print(f"    Macro F1:  {macro_f1_optimized:.4f}")
print(f"    Accuracy:  {accuracy_optimized:.4f}")
print(f"    F1 Gain:   +{(macro_f1_optimized - eval_results_ml['eval_macro_f1']):.4f}")

print(f"\n  Class-Weighted + Threshold Tuning:")
print(f"    Macro F1:  {macro_f1_weighted_optimized:.4f}")
print(f"    Accuracy:  {accuracy_weighted_optimized:.4f}")
print(f"    F1 Gain:   +{(macro_f1_weighted_optimized - eval_results_ml['eval_macro_f1']):.4f}")

print(f"\n  Binary Ensemble (Best):")
print(f"    Macro F1:  {ensemble_macro_f1:.4f}")
print(f"    Accuracy:  {ensemble_accuracy:.4f}")
print(f"    F1 Gain:   +{(ensemble_macro_f1 - eval_results_ml['eval_macro_f1']):.4f}")

print("\nSubmission Files:")
print("  - submission_subtask1.csv")
print("  - submission_subtask2.csv (class-weighted + threshold tuning)")
print("  - submission_subtask2_ensemble.csv (binary ensemble - recommended)")
print("=" * 80)

FINAL RESULTS SUMMARY

Subtask 1: Binary Classification
  Macro F1:  0.8299
  Accuracy:  0.8430

Subtask 2: Multi-Label Classification
  Baseline Model:
    Macro F1:  0.3546
    Accuracy:  0.7169

  Class-Weighted Model:
    Macro F1:  0.4247
    Accuracy:  0.6860
    F1 Gain:   +0.0700

  Baseline + Threshold Tuning:
    Macro F1:  0.4394
    Accuracy:  0.6674
    F1 Gain:   +0.0847

  Class-Weighted + Threshold Tuning:
    Macro F1:  0.4938
    Accuracy:  0.6860
    F1 Gain:   +0.1391

  Binary Ensemble (Best):
    Macro F1:  0.4563
    Accuracy:  0.0269
    F1 Gain:   +0.1016

Submission Files:
  - submission_subtask1.csv
  - submission_subtask2.csv (class-weighted + threshold tuning)
  - submission_subtask2_ensemble.csv (binary ensemble - recommended)


## 11. Model Persistence

In [33]:
# Save trained models
model_binary.save_pretrained('./saved_models/binary_classifier')
tokenizer_binary.save_pretrained('./saved_models/binary_classifier')

model_multilabel_weighted.save_pretrained('./saved_models/multilabel_classifier')
tokenizer_multilabel.save_pretrained('./saved_models/multilabel_classifier')

# Save ensemble models
for i, (model, label) in enumerate(zip(ensemble_models, label_columns)):
    save_path = f'./saved_models/ensemble_{label.replace("/", "_")}'
    model.save_pretrained(save_path)

print("Models saved successfully.")

Models saved successfully.
