# Imports

In [17]:
!pip install scikit-multilearn



In [19]:
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
import seaborn as sns

import torch
import torch.nn as nn
import torch.nn.functional as F

import gc # Garbage collection to manage memory
from collections import Counter
from pathlib import Path

import warnings
warnings.filterwarnings('ignore')

from sklearn.metrics import f1_score
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import train_test_split
from sklearn.utils.class_weight import compute_class_weight
from sklearn.metrics import recall_score, precision_score, f1_score

from transformers import (
    AutoTokenizer,
    AutoConfig,
    AutoModelForSequenceClassification,
    Trainer,
    TrainingArguments,
    DataCollatorWithPadding,
    EarlyStoppingCallback
)
from torch.utils.data import Dataset

In [20]:
import wandb

# Disable wandb logging for this script
wandb.init(mode="disabled")

In [22]:
from google.colab import drive
drive.mount('/content/gdrive')

Drive already mounted at /content/gdrive; to attempt to forcibly remount, call drive.mount("/content/gdrive", force_remount=True).


# Data Analysis

## Custom Plots

In [11]:
"""
Comprehensive EDA for SemEval-2026 Task 9 Paper
Generates publication-quality SVG plots for LaTeX import

Fixed: DataFrame truth value ambiguity error
"""

# ==========================================
# CONFIGURATION
# ==========================================
class Config:
    # Path
    BASE_PATH = "/content/gdrive/MyDrive/SemEval/dev_phase"
    OUTPUT_DIR = "./paper_figures"

    # File paths
    FILES = {
        'eng_s1_train': f"{BASE_PATH}/subtask1/train/eng.csv",
        'arb_s1_train': f"{BASE_PATH}/subtask1/train/arb.csv",
        'eng_s2_train': f"{BASE_PATH}/subtask2/train/eng.csv",
        'arb_s2_train': f"{BASE_PATH}/subtask2/train/arb.csv",
        'eng_s3_train': f"{BASE_PATH}/subtask3/train/eng.csv",
        'arb_s3_train': f"{BASE_PATH}/subtask3/train/arb.csv",
    }

    # Label definitions (from your CSVs)
    LABELS = {
        's1': ['polarization'],
        's2': ['political', 'racial/ethnic', 'religious', 'gender/sexual', 'other'],
        's3': ['stereotype', 'vilification', 'dehumanization',
               'extreme_language', 'lack_of_empathy', 'invalidation']
    }

    # Plot styling for academic papers
    STYLE = {
        'figure.figsize': (10, 6),
        'font.size': 11,
        'axes.labelsize': 12,
        'axes.titlesize': 13,
        'xtick.labelsize': 10,
        'ytick.labelsize': 10,
        'legend.fontsize': 10,
        'font.family': 'serif',
        'font.serif': ['Times New Roman', 'DejaVu Serif'],
    }

# Apply styling
plt.rcParams.update(Config.STYLE)
sns.set_palette("viridis")

# Create output directory
Path(Config.OUTPUT_DIR).mkdir(parents=True, exist_ok=True)

# ==========================================
# UTILITY FUNCTIONS
# ==========================================
def save_figure(fig, filename, dpi=300):
    """Save figure as high-quality SVG"""
    filepath = Path(Config.OUTPUT_DIR) / filename
    fig.savefig(filepath, format='svg', bbox_inches='tight', dpi=dpi)
    print(f"✅ Saved: {filepath}")
    plt.close(fig)

def load_data(key):
    """Load and validate data"""
    try:
        df = pd.read_csv(Config.FILES[key])
        print(f"✅ Loaded {key}: {len(df)} samples")
        return df
    except Exception as e:
        print(f"❌ Error loading {key}: {e}")
        return None

# ==========================================
# FIGURE 1: CLASS IMBALANCE COMPARISON
# ==========================================
def plot_imbalance_comparison():
    """
    Figure for Section 2.1: Class Distribution Imbalance
    Justifies need for Focal Loss
    """
    # Load Subtask 2 data (most imbalanced)
    eng_s2 = load_data('eng_s2_train')
    arb_s2 = load_data('arb_s2_train')

    if eng_s2 is None or arb_s2 is None:
        return

    labels = Config.LABELS['s2']

    # Calculate distributions
    eng_dist = eng_s2[labels].sum().sort_values(ascending=False)
    arb_dist = arb_s2[labels].sum().sort_values(ascending=False)

    # Create figure with two subplots
    fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(14, 5))

    # English
    bars1 = ax1.barh(range(len(eng_dist)), eng_dist.values, color='steelblue', alpha=0.8)
    ax1.set_yticks(range(len(eng_dist)))
    ax1.set_yticklabels(eng_dist.index)
    ax1.set_xlabel('Sample Count')
    ax1.set_title('(a) English Subtask 2', fontweight='bold')
    ax1.grid(axis='x', alpha=0.3)

    # Add count labels
    for i, (bar, val) in enumerate(zip(bars1, eng_dist.values)):
        pct = 100 * val / len(eng_s2)
        ax1.text(val + 20, i, f'{val} ({pct:.1f}%)',
                va='center', fontsize=9)

    # Calculate and display imbalance ratio
    eng_ratio = eng_dist.max() / eng_dist.min()
    ax1.text(0.02, 0.98, f'Imbalance: {eng_ratio:.1f}:1',
            transform=ax1.transAxes, va='top', fontsize=10,
            bbox=dict(boxstyle='round', facecolor='wheat', alpha=0.5))

    # Arabic
    bars2 = ax2.barh(range(len(arb_dist)), arb_dist.values, color='coral', alpha=0.8)
    ax2.set_yticks(range(len(arb_dist)))
    ax2.set_yticklabels(arb_dist.index)
    ax2.set_xlabel('Sample Count')
    ax2.set_title('(b) Arabic Subtask 2', fontweight='bold')
    ax2.grid(axis='x', alpha=0.3)

    for i, (bar, val) in enumerate(zip(bars2, arb_dist.values)):
        pct = 100 * val / len(arb_s2)
        ax2.text(val + 20, i, f'{val} ({pct:.1f}%)',
                va='center', fontsize=9)

    arb_ratio = arb_dist.max() / arb_dist.min()
    ax2.text(0.02, 0.98, f'Imbalance: {arb_ratio:.1f}:1',
            transform=ax2.transAxes, va='top', fontsize=10,
            bbox=dict(boxstyle='round', facecolor='wheat', alpha=0.5))

    plt.suptitle('Distribution of Polarization Categories in Training Set',
                fontsize=14, fontweight='bold', y=1.02)
    plt.tight_layout()

    save_figure(fig, 'fig1_class_imbalance.svg')

    # Print stats for paper
    print(f"\n📊 Imbalance Statistics for Paper:")
    print(f"English: {eng_ratio:.1f}:1 ratio")
    print(f"Arabic: {arb_ratio:.1f}:1 ratio")
    print(f"English most common: {eng_dist.index[0]} ({100*eng_dist.iloc[0]/len(eng_s2):.1f}%)")
    print(f"English least common: {eng_dist.index[-1]} ({100*eng_dist.iloc[-1]/len(eng_s2):.1f}%)")

# ==========================================
# FIGURE 2: LABEL CO-OCCURRENCE HEATMAP
# ==========================================
def plot_label_cooccurrence():
    """
    Figure for Section 2.2: Label Co-occurrence
    Shows correlation between labels
    """
    # Use Subtask 3 (manifestations have high correlation)
    arb_s3 = load_data('arb_s3_train')

    if arb_s3 is None:
        return

    labels = Config.LABELS['s3']

    # Calculate correlation
    corr_matrix = arb_s3[labels].corr()

    # Create figure
    fig, ax = plt.subplots(figsize=(10, 8))

    # Custom colormap for better visibility
    mask = np.triu(np.ones_like(corr_matrix, dtype=bool), k=1)

    sns.heatmap(corr_matrix,
                annot=True,
                fmt='.3f',
                cmap='RdYlBu_r',
                center=0,
                square=True,
                linewidths=1,
                cbar_kws={"shrink": 0.8, "label": "Pearson Correlation"},
                vmin=-0.2, vmax=1.0,
                mask=mask,
                ax=ax)

    ax.set_title('Label Co-occurrence in Arabic Subtask 3\n(Lower triangle shows Pearson correlation)',
                fontweight='bold', pad=20)

    # Highlight high correlations
    high_corr = []
    for i in range(len(labels)):
        for j in range(i+1, len(labels)):
            if corr_matrix.iloc[i, j] > 0.6:
                high_corr.append((labels[i], labels[j], corr_matrix.iloc[i, j]))

    # Add note about high correlations
    note_text = "Strong correlations (r > 0.6):\n"
    for label1, label2, corr in sorted(high_corr, key=lambda x: x[2], reverse=True)[:3]:
        note_text += f"• {label1} ↔ {label2}: {corr:.3f}\n"

    plt.text(0.02, -0.15, note_text.strip(), transform=ax.transAxes,
            fontsize=9, verticalalignment='top',
            bbox=dict(boxstyle='round', facecolor='lightyellow', alpha=0.8))

    plt.tight_layout()
    save_figure(fig, 'fig2_label_correlation.svg')

    print(f"\n📊 High Correlations for Paper:")
    for label1, label2, corr in sorted(high_corr, key=lambda x: x[2], reverse=True):
        print(f"  {label1} ↔ {label2}: r = {corr:.3f}")

# ==========================================
# FIGURE 3: TEXT LENGTH DISTRIBUTION
# ==========================================
def plot_text_length_comparison():
    """
    Shows text length differences between languages
    Justifies max_length parameter choice
    """
    # Load all Subtask 2 data
    eng_s2 = load_data('eng_s2_train')
    arb_s2 = load_data('arb_s2_train')

    if eng_s2 is None or arb_s2 is None:
        return

    # Calculate word counts
    eng_s2['word_count'] = eng_s2['text'].str.split().str.len()
    arb_s2['word_count'] = arb_s2['text'].str.split().str.len()

    # Create figure
    fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(14, 5))

    # English
    ax1.hist(eng_s2['word_count'], bins=40, color='steelblue',
            alpha=0.7, edgecolor='black', density=True)
    ax1.axvline(eng_s2['word_count'].mean(), color='red',
               linestyle='--', linewidth=2, label=f"Mean: {eng_s2['word_count'].mean():.1f}")
    ax1.axvline(eng_s2['word_count'].quantile(0.95), color='orange',
               linestyle='--', linewidth=2, label=f"95th %ile: {eng_s2['word_count'].quantile(0.95):.0f}")
    ax1.set_xlabel('Word Count')
    ax1.set_ylabel('Density')
    ax1.set_title('(a) English Text Length', fontweight='bold')
    ax1.legend()
    ax1.grid(alpha=0.3)

    # Arabic
    ax2.hist(arb_s2['word_count'], bins=40, color='coral',
            alpha=0.7, edgecolor='black', density=True)
    ax2.axvline(arb_s2['word_count'].mean(), color='red',
               linestyle='--', linewidth=2, label=f"Mean: {arb_s2['word_count'].mean():.1f}")
    ax2.axvline(arb_s2['word_count'].quantile(0.95), color='orange',
               linestyle='--', linewidth=2, label=f"95th %ile: {arb_s2['word_count'].quantile(0.95):.0f}")
    ax2.set_xlabel('Word Count')
    ax2.set_ylabel('Density')
    ax2.set_title('(b) Arabic Text Length', fontweight='bold')
    ax2.legend()
    ax2.grid(alpha=0.3)

    plt.suptitle('Text Length Distribution (Words per Sample)',
                fontsize=14, fontweight='bold', y=1.02)
    plt.tight_layout()

    save_figure(fig, 'fig3_text_length.svg')

    print(f"\n📊 Text Length Stats for Paper:")
    print(f"English: mean={eng_s2['word_count'].mean():.1f}, "
          f"median={eng_s2['word_count'].median():.1f}, "
          f"95th={eng_s2['word_count'].quantile(0.95):.0f}")
    print(f"Arabic: mean={arb_s2['word_count'].mean():.1f}, "
          f"median={arb_s2['word_count'].median():.1f}, "
          f"95th={arb_s2['word_count'].quantile(0.95):.0f}")

# ==========================================
# FIGURE 4: MULTI-LABEL COMPLEXITY
# ==========================================
def plot_multilabel_complexity():
    """
    Shows distribution of labels per sample
    Demonstrates multi-label classification challenge
    """
    eng_s2 = load_data('eng_s2_train')
    arb_s2 = load_data('arb_s2_train')
    eng_s3 = load_data('eng_s3_train')
    arb_s3 = load_data('arb_s3_train')

    # FIXED: Check each DataFrame individually instead of using 'in' operator
    if eng_s2 is None or arb_s2 is None or eng_s3 is None or arb_s3 is None:
        return

    # Calculate labels per sample
    datasets = {
        'English S2': eng_s2[Config.LABELS['s2']].sum(axis=1),
        'Arabic S2': arb_s2[Config.LABELS['s2']].sum(axis=1),
        'English S3': eng_s3[Config.LABELS['s3']].sum(axis=1),
        'Arabic S3': arb_s3[Config.LABELS['s3']].sum(axis=1),
    }

    fig, axes = plt.subplots(2, 2, figsize=(12, 10))
    axes = axes.flatten()

    for idx, (name, data) in enumerate(datasets.items()):
        ax = axes[idx]

        # Count distribution
        counts = data.value_counts().sort_index()

        bars = ax.bar(counts.index, counts.values,
                     color='steelblue' if 'English' in name else 'coral',
                     alpha=0.7, edgecolor='black')

        # Add value labels
        for bar in bars:
            height = bar.get_height()
            ax.text(bar.get_x() + bar.get_width()/2., height,
                   f'{int(height)}',
                   ha='center', va='bottom', fontsize=9)

        # Stats
        mean_labels = data.mean()
        zero_pct = 100 * (data == 0).sum() / len(data)
        multi_pct = 100 * (data > 1).sum() / len(data)

        stats_text = f"Mean: {mean_labels:.2f}\nNo labels: {zero_pct:.1f}%\nMulti-label: {multi_pct:.1f}%"
        ax.text(0.98, 0.98, stats_text, transform=ax.transAxes,
               verticalalignment='top', horizontalalignment='right',
               fontsize=9, bbox=dict(boxstyle='round', facecolor='wheat', alpha=0.7))

        ax.set_xlabel('Number of Labels')
        ax.set_ylabel('Number of Samples')
        ax.set_title(f'({chr(97+idx)}) {name}', fontweight='bold')
        ax.grid(axis='y', alpha=0.3)

    plt.suptitle('Multi-Label Complexity: Labels per Sample',
                fontsize=14, fontweight='bold', y=1.00)
    plt.tight_layout()

    save_figure(fig, 'fig4_multilabel_complexity.svg')

# ==========================================
# TABLE 1: DATASET STATISTICS
# ==========================================
def generate_dataset_stats_table():
    """
    Generates comprehensive statistics table for paper
    """
    print(f"\n{'='*80}")
    print("TABLE: DATASET STATISTICS (Copy to LaTeX)")
    print(f"{'='*80}\n")

    stats = []

    for subtask in ['s1', 's2', 's3']:
        for lang in ['eng', 'arb']:
            key = f"{lang}_{subtask}_train"
            df = load_data(key)

            if df is None:
                continue

            labels = Config.LABELS[subtask]

            # Calculate stats
            n_samples = len(df)
            word_counts = df['text'].str.split().str.len()
            mean_len = word_counts.mean()
            median_len = word_counts.median()

            if subtask == 's1':
                pos_pct = 100 * df[labels[0]].mean()
                stats.append({
                    'Subtask': subtask.upper(),
                    'Lang': lang.upper(),
                    'Samples': n_samples,
                    'Avg Length': f'{mean_len:.1f}',
                    'Positive %': f'{pos_pct:.1f}',
                    'Imbalance': '-'
                })
            else:
                label_sums = df[labels].sum(axis=1)
                no_label_pct = 100 * (label_sums == 0).sum() / n_samples
                multi_pct = 100 * (label_sums > 1).sum() / n_samples
                avg_labels = label_sums.mean()

                # Imbalance
                counts = df[labels].sum()
                imbalance = counts.max() / counts.min()

                stats.append({
                    'Subtask': subtask.upper(),
                    'Lang': lang.upper(),
                    'Samples': n_samples,
                    'Avg Length': f'{mean_len:.1f}',
                    'No Labels %': f'{no_label_pct:.1f}',
                    'Multi %': f'{multi_pct:.1f}',
                    'Avg Labels': f'{avg_labels:.2f}',
                    'Imbalance': f'{imbalance:.1f}:1'
                })

    # Print as formatted table
    df_stats = pd.DataFrame(stats)
    print(df_stats.to_string(index=False))

    # Save as CSV for easy LaTeX import
    csv_path = Path(Config.OUTPUT_DIR) / 'table_dataset_stats.csv'
    df_stats.to_csv(csv_path, index=False)
    print(f"\n✅ Table saved to: {csv_path}")
    print("Use \\csvreader or pandas-to-latex for LaTeX import")

# ==========================================
# MAIN EXECUTION
# ==========================================
def main():
    print("\n" + "="*80)
    print("GENERATING PUBLICATION-QUALITY FIGURES FOR SEMEVAL-2026 PAPER")
    print("="*80 + "\n")

    # Generate all figures
    print("\n📊 Generating Figure 1: Class Imbalance...")
    plot_imbalance_comparison()

    print("\n📊 Generating Figure 2: Label Co-occurrence...")
    plot_label_cooccurrence()

    print("\n📊 Generating Figure 3: Text Length Distribution...")
    plot_text_length_comparison()

    print("\n📊 Generating Figure 4: Multi-label Complexity...")
    plot_multilabel_complexity()

    print("\n📋 Generating Dataset Statistics Table...")
    generate_dataset_stats_table()

    print("\n" + "="*80)
    print("✅ ALL FIGURES GENERATED")
    print(f"📁 Location: {Config.OUTPUT_DIR}/")
    print("="*80)

    print("\n💡 LaTeX Import Tips:")
    print("1. Use \\includegraphics[width=0.8\\textwidth]{fig1_class_imbalance.svg}")
    print("2. SVG files maintain quality at any scale")
    print("3. Reference figures in text: \\ref{fig:imbalance}")
    print("4. Use table_dataset_stats.csv for Table 1")

if __name__ == "__main__":
    main()


GENERATING PUBLICATION-QUALITY FIGURES FOR SEMEVAL-2026 PAPER


📊 Generating Figure 1: Class Imbalance...
✅ Loaded eng_s2_train: 3222 samples
✅ Loaded arb_s2_train: 3380 samples
✅ Saved: paper_figures/fig1_class_imbalance.svg

📊 Imbalance Statistics for Paper:
English: 16.0:1 ratio
Arabic: 2.8:1 ratio
English most common: political (35.7%)
English least common: gender/sexual (2.2%)

📊 Generating Figure 2: Label Co-occurrence...
✅ Loaded arb_s3_train: 3380 samples
✅ Saved: paper_figures/fig2_label_correlation.svg

📊 High Correlations for Paper:
  stereotype ↔ extreme_language: r = 0.665
  stereotype ↔ vilification: r = 0.664
  vilification ↔ extreme_language: r = 0.650

📊 Generating Figure 3: Text Length Distribution...
✅ Loaded eng_s2_train: 3222 samples
✅ Loaded arb_s2_train: 3380 samples
✅ Saved: paper_figures/fig3_text_length.svg

📊 Text Length Stats for Paper:
English: mean=12.3, median=9.0, 95th=27
Arabic: mean=16.7, median=16.0, 95th=31

📊 Generating Figure 4: Multi-label Comple

# Subtask 1

In [16]:
"""
SemEval 2026 Task 9 - Subtask 1: Polarization Detection
Two-Stage Training Strategy (matching Subtasks 2 & 3)

Stage 1: Find optimal thresholds on train/val split
Stage 2: Train on ALL data, predict on test with optimized thresholds

CHANGES FROM ORIGINAL:
- Added two-stage training (was single-stage)
- Separated threshold optimization from final training
- No changes to model architecture, loss, or hyperparameters
"""

# ==========================================
# CONFIGURATION
# ==========================================
class Config:
    # Paths
    # TRAIN_PATH = "/content/gdrive/MyDrive/SemEval/dev_phase/subtask1/train/eng.csv"
    # TEST_PATH = "/content/gdrive/MyDrive/SemEval/dev_phase/subtask1/dev/eng.csv"
    # OUTPUT_PATH = "/content/gdrive/MyDrive/SemEval/dev_phase/subtask1/pred_eng_two_stage.csv"
    TRAIN_PATH = "/content/gdrive/MyDrive/SemEval/dev_phase/subtask1/train/arb.csv"
    TEST_PATH = "/content/gdrive/MyDrive/SemEval/dev_phase/subtask1/dev/arb.csv"
    OUTPUT_PATH = "/content/gdrive/MyDrive/SemEval/dev_phase/subtask1/pred_arb_two_stage.csv"

    # Model
    # MODEL_NAME = "cardiffnlp/twitter-roberta-base-sentiment-latest"
    MODEL_NAME = "UBC-NLP/MARBERTv2"
    MAX_LENGTH = 128

    # Training - Two Stages
    STAGE1_EPOCHS = 5  # For threshold finding
    STAGE2_EPOCHS = 6  # For final model on all data
    BATCH_SIZE = 32
    EVAL_BATCH_SIZE = 64
    LEARNING_RATE = 2e-5
    WEIGHT_DECAY = 0.01
    WARMUP_RATIO = 0.1
    EARLY_STOPPING_PATIENCE = 2
    VAL_SIZE = 0.2

    # Regularization
    HIDDEN_DROPOUT = 0.1
    ATTENTION_DROPOUT = 0.1

    # Other
    SEED = 40
    USE_FP16 = True
    TRAIN_FINAL_MODEL = True  # Set False to skip Stage 2 (for quick threshold tuning)

def set_seed(seed):
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)
    torch.backends.cudnn.deterministic = True

set_seed(Config.SEED)

# ==========================================
# DATASET
# ==========================================
class PolarizationDataset(torch.utils.data.Dataset):
    def __init__(self, texts, labels, tokenizer, max_length=128):
        self.texts = texts
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        encoding = self.tokenizer(
            self.texts[idx],
            truncation=True,
            padding=False,
            max_length=self.max_length,
            return_tensors='pt'
        )

        item = {key: encoding[key].squeeze() for key in encoding.keys()}
        item['labels'] = torch.tensor(self.labels[idx], dtype=torch.long)
        return item

# ==========================================
# METRICS
# ==========================================
def compute_metrics(eval_pred):
    predictions, labels = eval_pred
    preds = np.argmax(predictions, axis=1)

    f1_macro = f1_score(labels, preds, average='macro')
    f1_binary = f1_score(labels, preds, pos_label=1)

    return {
        'f1_macro': f1_macro,
        'f1_binary': f1_binary
    }

# ==========================================
# TRAINER WITH CLASS WEIGHTS
# ==========================================
class WeightedTrainer(Trainer):
    def __init__(self, class_weights, *args, **kwargs):
        super().__init__(*args, **kwargs)
        self.class_weights = class_weights

    def compute_loss(self, model, inputs, return_outputs=False, num_items_in_batch=None):
        labels = inputs.get("labels")
        outputs = model(**inputs)
        logits = outputs.get("logits")

        loss_fct = nn.CrossEntropyLoss(weight=self.class_weights.to(model.device))
        loss = loss_fct(logits.view(-1, self.model.config.num_labels), labels.view(-1))

        return (loss, outputs) if return_outputs else loss

# ==========================================
# THRESHOLD OPTIMIZATION
# ==========================================
def optimize_thresholds(val_probs, val_labels):
    """Scan thresholds to find optimal F1 Macro"""
    print(f"\n{'='*60}")
    print(f"THRESHOLD OPTIMIZATION")
    print(f"{'='*60}")

    threshold_range = np.arange(0.3, 0.7, 0.01)
    best_thresh = 0.5
    best_f1 = 0
    threshold_results = []

    for thresh in threshold_range:
        preds = (val_probs[:, 1] >= thresh).astype(int)
        f1 = f1_score(val_labels, preds, average='macro')
        threshold_results.append((thresh, f1))

        if f1 > best_f1:
            best_f1 = f1
            best_thresh = thresh

    print(f"Scanned {len(threshold_range)} thresholds from {threshold_range[0]:.2f} to {threshold_range[-1]:.2f}")
    print(f"Optimal Threshold: {best_thresh:.3f}")
    print(f"Validation F1 at Optimal Threshold: {best_f1:.4f}")

    # Show top 5
    threshold_results.sort(key=lambda x: x[1], reverse=True)
    print(f"\nTop 5 thresholds:")
    for i, (thresh, f1) in enumerate(threshold_results[:5], 1):
        print(f"  {i}. Threshold {thresh:.3f}: F1 = {f1:.4f}")

    return best_thresh, best_f1

# ==========================================
# STAGE 1: FIND OPTIMAL THRESHOLD
# ==========================================
def stage1_find_threshold(train_df):
    """Train on split data to find optimal threshold"""
    print(f"\n{'#'*60}")
    print(f"# STAGE 1: Threshold Optimization")
    print(f"{'#'*60}")

    # Stratified split
    train_split, val_split = train_test_split(
        train_df,
        test_size=Config.VAL_SIZE,
        random_state=Config.SEED,
        stratify=train_df['polarization']
    )

    print(f"\n📊 Split: {len(train_split)} train, {len(val_split)} val")
    print(f"  Train - Class 0: {(train_split['polarization']==0).sum()}, Class 1: {(train_split['polarization']==1).sum()}")
    print(f"  Val   - Class 0: {(val_split['polarization']==0).sum()}, Class 1: {(val_split['polarization']==1).sum()}")

    # Compute class weights
    class_weights = compute_class_weight(
        class_weight='balanced',
        classes=np.unique(train_split['polarization']),
        y=train_split['polarization']
    )
    class_weights_tensor = torch.tensor(class_weights, dtype=torch.float)
    print(f"\nClass weights: [Class 0: {class_weights[0]:.3f}, Class 1: {class_weights[1]:.3f}]")

    # Initialize
    tokenizer = AutoTokenizer.from_pretrained(Config.MODEL_NAME)
    model = AutoModelForSequenceClassification.from_pretrained(
        Config.MODEL_NAME,
        num_labels=2,
        ignore_mismatched_sizes=True,
        hidden_dropout_prob=Config.HIDDEN_DROPOUT,
        attention_probs_dropout_prob=Config.ATTENTION_DROPOUT
    )

    # Datasets
    train_dataset = PolarizationDataset(
        train_split['text'].tolist(),
        train_split['polarization'].tolist(),
        tokenizer,
        max_length=Config.MAX_LENGTH
    )
    val_dataset = PolarizationDataset(
        val_split['text'].tolist(),
        val_split['polarization'].tolist(),
        tokenizer,
        max_length=Config.MAX_LENGTH
    )

    # Training args
    training_args = TrainingArguments(
        output_dir="./results_s1_stage1",
        num_train_epochs=Config.STAGE1_EPOCHS,
        learning_rate=Config.LEARNING_RATE,
        per_device_train_batch_size=Config.BATCH_SIZE,
        per_device_eval_batch_size=Config.EVAL_BATCH_SIZE,
        gradient_accumulation_steps=1,
        weight_decay=Config.WEIGHT_DECAY,
        warmup_ratio=Config.WARMUP_RATIO,
        fp16=Config.USE_FP16,
        eval_strategy="epoch",
        save_strategy="epoch",
        load_best_model_at_end=True,
        metric_for_best_model="f1_macro",
        greater_is_better=True,
        logging_steps=50,
        save_total_limit=1,
        report_to="none"
    )

    # Train
    trainer = WeightedTrainer(
        class_weights=class_weights_tensor,
        model=model,
        args=training_args,
        train_dataset=train_dataset,
        eval_dataset=val_dataset,
        compute_metrics=compute_metrics,
        data_collator=DataCollatorWithPadding(tokenizer),
        callbacks=[EarlyStoppingCallback(early_stopping_patience=Config.EARLY_STOPPING_PATIENCE)]
    )

    print(f"\n🚀 Training Stage 1...")
    trainer.train()

    # Evaluate
    eval_metrics = trainer.evaluate()
    print(f"\n✅ Stage 1 Results (0.5 threshold):")
    print(f"  F1 Macro: {eval_metrics['eval_f1_macro']:.4f}")
    print(f"  F1 Binary (Class 1): {eval_metrics['eval_f1_binary']:.4f}")

    # Get predictions for threshold optimization
    val_predictions = trainer.predict(val_dataset)
    val_probs = torch.nn.functional.softmax(
        torch.tensor(val_predictions.predictions), dim=1
    ).numpy()
    val_labels = val_split['polarization'].values

    # Optimize threshold
    best_thresh, optimized_f1 = optimize_thresholds(val_probs, val_labels)

    print(f"\n🎯 Optimized Validation F1: {optimized_f1:.4f}")
    print(f"   Improvement over 0.5: +{optimized_f1 - eval_metrics['eval_f1_macro']:.4f}")

    # Cleanup
    del model, trainer, tokenizer
    torch.cuda.empty_cache()
    gc.collect()

    return best_thresh, optimized_f1

# ==========================================
# STAGE 2: TRAIN FINAL MODEL
# ==========================================
def stage2_train_final(train_df, test_df, best_thresh):
    """Train on ALL data and predict with optimized threshold"""
    print(f"\n{'#'*60}")
    print(f"# STAGE 2: Final Model Training (All Data)")
    print(f"{'#'*60}")

    print(f"\n📊 Training on ALL {len(train_df)} samples")
    print(f"  Class 0: {(train_df['polarization']==0).sum()}")
    print(f"  Class 1: {(train_df['polarization']==1).sum()}")

    # Compute class weights on full dataset
    class_weights = compute_class_weight(
        class_weight='balanced',
        classes=np.unique(train_df['polarization']),
        y=train_df['polarization']
    )
    class_weights_tensor = torch.tensor(class_weights, dtype=torch.float)

    # Initialize
    tokenizer = AutoTokenizer.from_pretrained(Config.MODEL_NAME)
    model = AutoModelForSequenceClassification.from_pretrained(
        Config.MODEL_NAME,
        num_labels=2,
        ignore_mismatched_sizes=True,
        hidden_dropout_prob=Config.HIDDEN_DROPOUT,
        attention_probs_dropout_prob=Config.ATTENTION_DROPOUT
    )

    # Dataset
    train_dataset = PolarizationDataset(
        train_df['text'].tolist(),
        train_df['polarization'].tolist(),
        tokenizer,
        max_length=Config.MAX_LENGTH
    )

    # Training args - no validation
    training_args = TrainingArguments(
        output_dir="./results_s1_stage2",
        num_train_epochs=Config.STAGE2_EPOCHS,
        learning_rate=Config.LEARNING_RATE,
        per_device_train_batch_size=Config.BATCH_SIZE,
        gradient_accumulation_steps=1,
        weight_decay=Config.WEIGHT_DECAY,
        warmup_ratio=Config.WARMUP_RATIO,
        fp16=Config.USE_FP16,
        logging_steps=50,
        save_strategy="no",  # No validation, no checkpoints
        report_to="none"
    )

    # Train
    trainer = WeightedTrainer(
        class_weights=class_weights_tensor,
        model=model,
        args=training_args,
        train_dataset=train_dataset,
        data_collator=DataCollatorWithPadding(tokenizer)
    )

    print(f"\n🚀 Training Final Model...")
    trainer.train()

    # Predict on test
    print(f"\n📊 Predicting on test set...")
    test_dataset = PolarizationDataset(
        test_df['text'].tolist(),
        [0] * len(test_df),  # Dummy labels
        tokenizer,
        max_length=Config.MAX_LENGTH
    )

    test_predictions = trainer.predict(test_dataset)
    test_probs = torch.nn.functional.softmax(
        torch.tensor(test_predictions.predictions), dim=1
    ).numpy()

    # Apply optimized threshold
    final_preds = (test_probs[:, 1] >= best_thresh).astype(int)

    # Cleanup
    del model, trainer, tokenizer
    torch.cuda.empty_cache()
    gc.collect()

    return final_preds

# ==========================================
# MAIN
# ==========================================
def main():
    print(f"\n{'#'*60}")
    print(f"# Subtask 1: Two-Stage Training")
    print(f"# Stage 1: Find threshold on train/val split")
    print(f"# Stage 2: Train on all data, predict with threshold")
    print(f"{'#'*60}")

    # Load
    print(f"\n📂 Loading data...")
    train_df = pd.read_csv(Config.TRAIN_PATH)
    test_df = pd.read_csv(Config.TEST_PATH)

    print(f"✅ Train: {len(train_df)} samples")
    print(f"✅ Test: {len(test_df)} samples")

    # Stats
    print(f"\n📊 Training Set:")
    print(f"  Class 0: {(train_df['polarization']==0).sum()} ({100*(train_df['polarization']==0).mean():.1f}%)")
    print(f"  Class 1: {(train_df['polarization']==1).sum()} ({100*(train_df['polarization']==1).mean():.1f}%)")

    word_lengths = train_df['text'].str.split().str.len()
    print(f"\n  Text length: mean={word_lengths.mean():.1f}, median={word_lengths.median():.1f}, 95th={word_lengths.quantile(0.95):.0f}")

    # Stage 1: Find optimal threshold
    best_thresh, val_f1 = stage1_find_threshold(train_df)

    # Stage 2: Train final model
    if Config.TRAIN_FINAL_MODEL:
        final_preds = stage2_train_final(train_df, test_df, best_thresh)

        # Save
        print(f"\n💾 Saving predictions...")
        submission = pd.DataFrame({
            'id': test_df['id'],
            'polarization': final_preds
        })
        submission.to_csv(Config.OUTPUT_PATH, index=False)

        print(f"\n✅ Saved: {Config.OUTPUT_PATH}")

        # Stats
        print(f"\n📊 Prediction Distribution:")
        unique, counts = np.unique(final_preds, return_counts=True)
        for label, count in zip(unique, counts):
            pct = 100 * count / len(final_preds)
            train_pct = 100 * (train_df['polarization'] == label).mean()
            print(f"  Class {label}: {count} ({pct:.1f}%) | Train: {train_pct:.1f}%")
    else:
        print(f"\n⭐ Skipping Stage 2 (TRAIN_FINAL_MODEL=False)")

    print(f"\n{'='*60}")
    print(f"✅ COMPLETE!")
    print(f"{'='*60}")
    print(f"\nValidation F1: {val_f1:.4f}")
    print(f"Optimal Threshold: {best_thresh:.3f}")

    return best_thresh, val_f1

if __name__ == "__main__":
    threshold, f1 = main()
    print("\n✨ Done! Submit the CSV file to the competition.")


############################################################
# Subtask 1: Two-Stage Training
# Stage 1: Find threshold on train/val split
# Stage 2: Train on all data, predict with threshold
############################################################

📂 Loading data...
✅ Train: 3380 samples
✅ Test: 169 samples

📊 Training Set:
  Class 0: 1868 (55.3%)
  Class 1: 1512 (44.7%)

  Text length: mean=16.7, median=16.0, 95th=31

############################################################
# STAGE 1: Threshold Optimization
############################################################

📊 Split: 2704 train, 676 val
  Train - Class 0: 1494, Class 1: 1210
  Val   - Class 0: 374, Class 1: 302

Class weights: [Class 0: 0.905, Class 1: 1.117]


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at UBC-NLP/MARBERTv2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.



🚀 Training Stage 1...


Epoch,Training Loss,Validation Loss,F1 Macro,F1 Binary
1,0.6201,0.444563,0.789343,0.778125
2,0.3012,0.451734,0.791416,0.79049
3,0.208,0.542748,0.806141,0.802413
4,0.1009,0.635459,0.810266,0.791461
5,0.0737,0.674629,0.81539,0.800643



✅ Stage 1 Results (0.5 threshold):
  F1 Macro: 0.8154
  F1 Binary (Class 1): 0.8006

THRESHOLD OPTIMIZATION
Scanned 40 thresholds from 0.30 to 0.69
Optimal Threshold: 0.420
Validation F1 at Optimal Threshold: 0.8185

Top 5 thresholds:
  1. Threshold 0.420: F1 = 0.8185
  2. Threshold 0.430: F1 = 0.8185
  3. Threshold 0.440: F1 = 0.8185
  4. Threshold 0.450: F1 = 0.8185
  5. Threshold 0.460: F1 = 0.8185

🎯 Optimized Validation F1: 0.8185
   Improvement over 0.5: +0.0031

############################################################
# STAGE 2: Final Model Training (All Data)
############################################################

📊 Training on ALL 3380 samples
  Class 0: 1868
  Class 1: 1512


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at UBC-NLP/MARBERTv2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.



🚀 Training Final Model...


Step,Training Loss
50,0.6509
100,0.4375
150,0.3454
200,0.3156
250,0.2243
300,0.1838
350,0.1373
400,0.1146
450,0.083
500,0.0609



📊 Predicting on test set...



💾 Saving predictions...

✅ Saved: /content/gdrive/MyDrive/SemEval/dev_phase/subtask1/pred_arb_two_stage.csv

📊 Prediction Distribution:
  Class 0: 98 (58.0%) | Train: 55.3%
  Class 1: 71 (42.0%) | Train: 44.7%

✅ COMPLETE!

Validation F1: 0.8185
Optimal Threshold: 0.420

✨ Done! Submit the CSV file to the competition.


# Subtask 2

## english

In [9]:
"""
English Subtask 2: Two-Stage Training
Stage 1: Find thresholds on train/val split
Stage 2: Train on ALL data, predict on test
"""

# ==========================================
# CONFIGURATION
# ==========================================
class Config:
    # Paths
    TRAIN_PATH = "/content/gdrive/MyDrive/SemEval/dev_phase/subtask2/train/eng.csv"
    TEST_PATH = "/content/gdrive/MyDrive/SemEval/dev_phase/subtask2/dev/eng.csv"
    OUTPUT_PATH = "/content/gdrive/MyDrive/SemEval/dev_phase/subtask2/pred_eng_two_stage.csv"

    # Labels
    LABEL_COLUMNS = ['political', 'racial/ethnic', 'religious', 'gender/sexual', 'other']

    # Model
    MODEL_NAME = "microsoft/deberta-v3-base"
    MAX_LENGTH = 64

    # Training
    STAGE1_EPOCHS = 5  # For threshold finding
    STAGE2_EPOCHS = 6  # For final model
    BATCH_SIZE = 16
    EVAL_BATCH_SIZE = 32
    LEARNING_RATE = 2e-5
    WEIGHT_DECAY = 0.01
    WARMUP_RATIO = 0.1
    VAL_SIZE = 0.2

    # Regularization
    HIDDEN_DROPOUT = 0.1
    ATTENTION_DROPOUT = 0.1

    # Focal Loss
    USE_FOCAL_LOSS = True
    FOCAL_ALPHA = 0.25
    FOCAL_GAMMA = 2.0  # Higher for English's extreme imbalance

    # Other
    SEED = 42
    USE_FP16 = True
    TRAIN_FINAL_MODEL = True

def set_seed(seed):
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)

set_seed(Config.SEED)

# ==========================================
# FOCAL LOSS
# ==========================================
class FocalLoss(nn.Module):
    def __init__(self, alpha=0.25, gamma=2.2):
        super().__init__()
        self.alpha = alpha
        self.gamma = gamma

    def forward(self, inputs, targets):
        BCE_loss = F.binary_cross_entropy_with_logits(inputs, targets, reduction='none')
        pt = torch.exp(-BCE_loss)
        focal_loss = self.alpha * (1 - pt) ** self.gamma * BCE_loss
        return focal_loss.mean()

# ==========================================
# DATASET
# ==========================================
class MultiLabelDataset(torch.utils.data.Dataset):
    def __init__(self, texts, labels, tokenizer, max_length=64):
        self.texts = texts
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        encoding = self.tokenizer(
            self.texts[idx],
            truncation=True,
            padding=False,
            max_length=self.max_length,
            return_tensors='pt'
        )

        item = {key: encoding[key].squeeze() for key in encoding.keys()}
        item['labels'] = torch.tensor(self.labels[idx], dtype=torch.float)
        return item

# ==========================================
# METRICS
# ==========================================
def compute_metrics(eval_pred):
    predictions, labels = eval_pred
    probs = 1 / (1 + np.exp(-predictions))
    preds = (probs > 0.5).astype(int)

    f1_macro = f1_score(labels, preds, average='macro', zero_division=0)
    f1_per_class = f1_score(labels, preds, average=None, zero_division=0)

    metrics = {'f1_macro': f1_macro}
    for i, col in enumerate(Config.LABEL_COLUMNS):
        metrics[f'f1_{col}'] = f1_per_class[i]

    return metrics

# ==========================================
# TRAINER
# ==========================================
class FocalLossTrainer(Trainer):
    def __init__(self, focal_loss_fn, *args, **kwargs):
        super().__init__(*args, **kwargs)
        self.focal_loss_fn = focal_loss_fn

    def compute_loss(self, model, inputs, return_outputs=False, num_items_in_batch=None):
        labels = inputs.pop("labels")
        outputs = model(**inputs)
        logits = outputs.logits
        loss = self.focal_loss_fn(logits, labels)
        return (loss, outputs) if return_outputs else loss

# ==========================================
# THRESHOLD OPTIMIZATION
# ==========================================
def optimize_thresholds(val_probs, val_labels, label_names):
    print(f"\n{'='*60}")
    print(f"THRESHOLD OPTIMIZATION (Class-Aware)")
    print(f"{'='*60}")

    best_thresholds = []
    class_frequencies = val_labels.sum(axis=0) / len(val_labels)

    for i, label_name in enumerate(label_names):
        freq = class_frequencies[i]
        best_thresh = 0.5
        best_f1 = 0

        # Class-specific ranges for extreme English imbalance
        if freq < 0.05:  # Very rare
            threshold_range = np.arange(0.05, 0.7, 0.05)
        elif freq < 0.15:  # Rare
            threshold_range = np.arange(0.1, 0.75, 0.05)
        else:  # Common
            threshold_range = np.arange(0.15, 0.8, 0.05)

        for thresh in threshold_range:
            preds = (val_probs[:, i] >= thresh).astype(int)
            f1 = f1_score(val_labels[:, i], preds, zero_division=0)

            if f1 > best_f1:
                best_f1 = f1
                best_thresh = thresh

        best_thresholds.append(best_thresh)
        print(f"{label_name:20s}: threshold={best_thresh:.2f}, F1={best_f1:.4f} "
              f"(freq={100*freq:.1f}%)")

    return np.array(best_thresholds)

# ==========================================
# STAGE 1: FIND THRESHOLDS
# ==========================================
def stage1_find_thresholds(train_df):
    print(f"\n{'#'*60}")
    print(f"# STAGE 1: Threshold Optimization")
    print(f"{'#'*60}")

    # Split
    has_label = (train_df[Config.LABEL_COLUMNS].sum(axis=1) > 0).astype(int)
    train_data, val_data = train_test_split(
        train_df,
        test_size=Config.VAL_SIZE,
        random_state=Config.SEED,
        stratify=has_label
    )

    print(f"\n📊 Split: {len(train_data)} train, {len(val_data)} val")

    # Initialize
    tokenizer = AutoTokenizer.from_pretrained(Config.MODEL_NAME)
    model = AutoModelForSequenceClassification.from_pretrained(
        Config.MODEL_NAME,
        num_labels=len(Config.LABEL_COLUMNS),
        problem_type="multi_label_classification",
        hidden_dropout_prob=Config.HIDDEN_DROPOUT,
        attention_probs_dropout_prob=Config.ATTENTION_DROPOUT
    )

    # Datasets
    train_dataset = MultiLabelDataset(
        train_data['text'].tolist(),
        train_data[Config.LABEL_COLUMNS].values,
        tokenizer,
        max_length=Config.MAX_LENGTH
    )
    val_dataset = MultiLabelDataset(
        val_data['text'].tolist(),
        val_data[Config.LABEL_COLUMNS].values,
        tokenizer,
        max_length=Config.MAX_LENGTH
    )

    # Training args
    training_args = TrainingArguments(
        output_dir="./results_eng_s2_stage1",
        num_train_epochs=Config.STAGE1_EPOCHS,
        learning_rate=Config.LEARNING_RATE,
        per_device_train_batch_size=Config.BATCH_SIZE,
        per_device_eval_batch_size=Config.EVAL_BATCH_SIZE,
        gradient_accumulation_steps=2,
        weight_decay=Config.WEIGHT_DECAY,
        warmup_ratio=Config.WARMUP_RATIO,
        fp16=Config.USE_FP16,
        eval_strategy="epoch",
        save_strategy="epoch",
        load_best_model_at_end=True,
        metric_for_best_model="f1_macro",
        logging_steps=50,
        save_total_limit=1,
        report_to="none"
    )

    # Train
    focal_loss = FocalLoss(alpha=Config.FOCAL_ALPHA, gamma=Config.FOCAL_GAMMA)
    trainer = FocalLossTrainer(
        focal_loss_fn=focal_loss,
        model=model,
        args=training_args,
        train_dataset=train_dataset,
        eval_dataset=val_dataset,
        compute_metrics=compute_metrics,
        data_collator=DataCollatorWithPadding(tokenizer),
        callbacks=[EarlyStoppingCallback(early_stopping_patience=3)]
    )

    print(f"\n🚀 Training Stage 1...")
    trainer.train()

    # Evaluate
    eval_metrics = trainer.evaluate()
    print(f"\n✅ Stage 1 Results:")
    print(f"  F1 Macro: {eval_metrics['eval_f1_macro']:.4f}")

    # Get predictions
    val_predictions = trainer.predict(val_dataset)
    val_logits = val_predictions.predictions
    val_probs = 1 / (1 + np.exp(-val_logits))
    val_labels = val_data[Config.LABEL_COLUMNS].values

    # Optimize thresholds
    best_thresholds = optimize_thresholds(val_probs, val_labels, Config.LABEL_COLUMNS)

    # Calculate optimized F1
    val_preds_optimized = np.zeros_like(val_probs, dtype=int)
    for i, thresh in enumerate(best_thresholds):
        val_preds_optimized[:, i] = (val_probs[:, i] >= thresh).astype(int)

    optimized_f1 = f1_score(val_labels, val_preds_optimized, average='macro', zero_division=0)
    print(f"\n🎯 Optimized Validation F1: {optimized_f1:.4f}")

    # Cleanup
    del model, trainer, tokenizer
    torch.cuda.empty_cache()
    gc.collect()

    return best_thresholds

# ==========================================
# STAGE 2: FINAL MODEL
# ==========================================
def stage2_train_final(train_df, test_df, best_thresholds):
    print(f"\n{'#'*60}")
    print(f"# STAGE 2: Final Model Training (All Data)")
    print(f"{'#'*60}")

    print(f"\n📊 Training on ALL {len(train_df)} samples")

    # Initialize
    tokenizer = AutoTokenizer.from_pretrained(Config.MODEL_NAME)
    model = AutoModelForSequenceClassification.from_pretrained(
        Config.MODEL_NAME,
        num_labels=len(Config.LABEL_COLUMNS),
        problem_type="multi_label_classification",
        hidden_dropout_prob=Config.HIDDEN_DROPOUT,
        attention_probs_dropout_prob=Config.ATTENTION_DROPOUT
    )

    # Dataset
    train_dataset = MultiLabelDataset(
        train_df['text'].tolist(),
        train_df[Config.LABEL_COLUMNS].values,
        tokenizer,
        max_length=Config.MAX_LENGTH
    )

    # Training args
    training_args = TrainingArguments(
        output_dir="./results_eng_s2_stage2",
        num_train_epochs=Config.STAGE2_EPOCHS,
        learning_rate=Config.LEARNING_RATE,
        per_device_train_batch_size=Config.BATCH_SIZE,
        gradient_accumulation_steps=2,
        weight_decay=Config.WEIGHT_DECAY,
        warmup_ratio=Config.WARMUP_RATIO,
        fp16=Config.USE_FP16,
        logging_steps=50,
        save_strategy="no",
        report_to="none"
    )

    # Train
    focal_loss = FocalLoss(alpha=Config.FOCAL_ALPHA, gamma=Config.FOCAL_GAMMA)
    trainer = FocalLossTrainer(
        focal_loss_fn=focal_loss,
        model=model,
        args=training_args,
        train_dataset=train_dataset,
        data_collator=DataCollatorWithPadding(tokenizer)
    )

    print(f"\n🚀 Training Final Model...")
    trainer.train()

    # Predict
    print(f"\n📊 Predicting on test set...")
    test_dataset = MultiLabelDataset(
        test_df['text'].tolist(),
        np.zeros((len(test_df), len(Config.LABEL_COLUMNS))),
        tokenizer,
        max_length=Config.MAX_LENGTH
    )

    test_predictions = trainer.predict(test_dataset)
    test_logits = test_predictions.predictions
    test_probs = 1 / (1 + np.exp(-test_logits))

    # Apply thresholds
    final_preds = np.zeros_like(test_probs, dtype=int)
    for i, thresh in enumerate(best_thresholds):
        final_preds[:, i] = (test_probs[:, i] >= thresh).astype(int)

    # Cleanup
    del model, trainer, tokenizer
    torch.cuda.empty_cache()
    gc.collect()

    return final_preds

# ==========================================
# MAIN
# ==========================================
def main():
    print(f"\n{'#'*60}")
    print(f"# English Subtask 2: Two-Stage Training")
    print(f"# Focal Loss γ={Config.FOCAL_GAMMA} for 16:1 imbalance")
    print(f"{'#'*60}")

    # Load
    print(f"\n📂 Loading data...")
    train_df = pd.read_csv(Config.TRAIN_PATH)
    test_df = pd.read_csv(Config.TEST_PATH)

    # Clean
    for col in Config.LABEL_COLUMNS:
        if col in train_df.columns:
            train_df[col] = train_df[col].fillna(0).astype(int)

    print(f"✅ Train: {len(train_df)} samples")
    print(f"✅ Test: {len(test_df)} samples (blind)")

    # Stats
    print(f"\n📊 Label Distribution:")
    for col in Config.LABEL_COLUMNS:
        count = train_df[col].sum()
        pct = 100 * train_df[col].mean()
        print(f"  {col:20s}: {count:4d} ({pct:5.2f}%)")

    # Stage 1
    best_thresholds = stage1_find_thresholds(train_df)

    # Stage 2
    if Config.TRAIN_FINAL_MODEL:
        final_preds = stage2_train_final(train_df, test_df, best_thresholds)

        # Save
        print(f"\n💾 Saving predictions...")
        submission = pd.DataFrame(final_preds, columns=Config.LABEL_COLUMNS)
        submission.insert(0, 'id', test_df['id'])
        submission.to_csv(Config.OUTPUT_PATH, index=False)

        print(f"\n✅ Saved: {Config.OUTPUT_PATH}")

        # Stats
        print(f"\n📊 Prediction Distribution:")
        for i, col in enumerate(Config.LABEL_COLUMNS):
            count = final_preds[:, i].sum()
            pct = 100 * count / len(final_preds)
            train_pct = 100 * train_df[col].mean()
            print(f"  {col:20s}: {count:3d} ({pct:5.2f}%) | Train: {train_pct:5.2f}%")
    else:
        print(f"\n⭐ Skipping Stage 2")

    print(f"\n{'='*60}")
    print(f"✅ COMPLETE!")
    print(f"{'='*60}")

    return best_thresholds

if __name__ == "__main__":
    thresholds = main()
    print("\n✨ Done!")


############################################################
# English Subtask 2: Two-Stage Training
# Focal Loss γ=2.2 for 16:1 imbalance
############################################################

📂 Loading data...
✅ Train: 3222 samples
✅ Test: 160 samples (blind)

📊 Label Distribution:
  political           : 1150 (35.69%)
  racial/ethnic       :  281 ( 8.72%)
  religious           :  112 ( 3.48%)
  gender/sexual       :   72 ( 2.23%)
  other               :  126 ( 3.91%)

############################################################
# STAGE 1: Threshold Optimization
############################################################

📊 Split: 2577 train, 645 val


tokenizer_config.json:   0%|          | 0.00/52.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/579 [00:00<?, ?B/s]

spm.model:   0%|          | 0.00/2.46M [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/371M [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/371M [00:00<?, ?B/s]

Some weights of DebertaV2ForSequenceClassification were not initialized from the model checkpoint at microsoft/deberta-v3-base and are newly initialized: ['classifier.bias', 'classifier.weight', 'pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.



🚀 Training Stage 1...


Epoch,Training Loss,Validation Loss,F1 Macro,F1 Political,F1 Racial/ethnic,F1 Religious,F1 Gender/sexual,F1 Other
1,0.0293,0.014687,0.118182,0.590909,0.0,0.0,0.0,0.0
2,0.012,0.012527,0.148659,0.743295,0.0,0.0,0.0,0.0
3,0.0106,0.012539,0.159862,0.738703,0.060606,0.0,0.0,0.0
4,0.0087,0.014068,0.187009,0.735043,0.2,0.0,0.0,0.0
5,0.0075,0.015502,0.195737,0.712018,0.266667,0.0,0.0,0.0



✅ Stage 1 Results:
  F1 Macro: 0.1957

THRESHOLD OPTIMIZATION (Class-Aware)
political           : threshold=0.35, F1=0.7604 (freq=35.7%)
racial/ethnic       : threshold=0.25, F1=0.4615 (freq=9.6%)
religious           : threshold=0.30, F1=0.3000 (freq=4.0%)
gender/sexual       : threshold=0.25, F1=0.2222 (freq=3.1%)
other               : threshold=0.35, F1=0.1772 (freq=3.4%)

🎯 Optimized Validation F1: 0.3843

############################################################
# STAGE 2: Final Model Training (All Data)
############################################################

📊 Training on ALL 3222 samples


Some weights of DebertaV2ForSequenceClassification were not initialized from the model checkpoint at microsoft/deberta-v3-base and are newly initialized: ['classifier.bias', 'classifier.weight', 'pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.



🚀 Training Final Model...


Step,Training Loss
50,0.0268
100,0.0146
150,0.0124
200,0.012
250,0.0101
300,0.0092
350,0.0077
400,0.0076
450,0.0065
500,0.0064



📊 Predicting on test set...



💾 Saving predictions...

✅ Saved: /content/gdrive/MyDrive/SemEval/dev_phase/subtask2/pred_eng_two_stage.csv

📊 Prediction Distribution:
  political           :  63 (39.38%) | Train: 35.69%
  racial/ethnic       :  21 (13.12%) | Train:  8.72%
  religious           :   8 ( 5.00%) | Train:  3.48%
  gender/sexual       :  14 ( 8.75%) | Train:  2.23%
  other               :   5 ( 3.12%) | Train:  3.91%

✅ COMPLETE!

✨ Done!


## arabic

In [None]:
"""
FIXED: Arabic Subtask 2 with MARBERT
Adjusted for Arabic data characteristics:
- Longer texts (16.7 words avg, 31 words 95th percentile)
- Better class balance (2.8:1 vs 16:1)
- Dev set has NO labels (blind test set)

CRITICAL CHANGE: Since dev set has no labels, we:
1. Split training data into train/val (80/20)
2. Optimize thresholds on our validation split
3. Train final model on ALL training data
4. Predict on blind test set
"""

# ==========================================
# CONFIGURATION - OPTIMIZED FOR ARABIC
# ==========================================
class Config:
    # Paths

    TRAIN_PATH = "/content/gdrive/MyDrive/SemEval/dev_phase/subtask2/train/arb.csv"
    TEST_PATH = "/content/gdrive/MyDrive/SemEval/dev_phase/subtask2/dev/arb.csv"
    OUTPUT_PATH = "/content/gdrive/MyDrive/SemEval/dev_phase/subtask2/pred_arb_marbert.csv"

    # Labels
    LABEL_COLUMNS = ['political', 'racial/ethnic', 'religious', 'gender/sexual', 'other']

    # Model
    MODEL_NAME = "UBC-NLP/MARBERT"
    MAX_LENGTH = 96  # INCREASED from 64 (Arabic: 31 words at 95th percentile)

    # Training - Two stages
    STAGE1_EPOCHS = 5  # For finding thresholds
    STAGE2_EPOCHS = 6  # Final model on all data
    BATCH_SIZE = 16
    EVAL_BATCH_SIZE = 32
    LEARNING_RATE = 2e-5
    WEIGHT_DECAY = 0.01
    WARMUP_RATIO = 0.1
    VAL_SIZE = 0.2

    # Regularization - REDUCED (better balance, less overfitting risk)
    HIDDEN_DROPOUT = 0.1  # REDUCED from 0.15
    ATTENTION_DROPOUT = 0.1

    # Loss - ADJUSTED for better balance
    USE_FOCAL_LOSS = True
    FOCAL_ALPHA = 0.25
    FOCAL_GAMMA = 1.5  # REDUCED from 2.0 (less extreme imbalance)

    # Other
    SEED = 42
    USE_FP16 = True
    TRAIN_FINAL_MODEL = True  # Set False for quick threshold tuning only

def set_seed(seed):
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)

set_seed(Config.SEED)

# ==========================================
# FOCAL LOSS
# ==========================================
class FocalLoss(nn.Module):
    def __init__(self, alpha=0.25, gamma=1.5):
        super().__init__()
        self.alpha = alpha
        self.gamma = gamma

    def forward(self, inputs, targets):
        BCE_loss = F.binary_cross_entropy_with_logits(inputs, targets, reduction='none')
        pt = torch.exp(-BCE_loss)
        focal_loss = self.alpha * (1 - pt) ** self.gamma * BCE_loss
        return focal_loss.mean()

# ==========================================
# DATASET
# ==========================================
class MultiLabelDataset(torch.utils.data.Dataset):
    def __init__(self, texts, labels, tokenizer, max_length=96):
        self.texts = texts
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        encoding = self.tokenizer(
            self.texts[idx],
            truncation=True,
            padding=False,
            max_length=self.max_length,
            return_tensors='pt'
        )

        item = {key: encoding[key].squeeze() for key in encoding.keys()}
        item['labels'] = torch.tensor(self.labels[idx], dtype=torch.float)
        return item

# ==========================================
# METRICS
# ==========================================
def compute_metrics(eval_pred):
    predictions, labels = eval_pred
    probs = 1 / (1 + np.exp(-predictions))
    preds = (probs > 0.5).astype(int)

    f1_macro = f1_score(labels, preds, average='macro', zero_division=0)
    f1_per_class = f1_score(labels, preds, average=None, zero_division=0)

    metrics = {'f1_macro': f1_macro}
    for i, col in enumerate(Config.LABEL_COLUMNS):
        metrics[f'f1_{col}'] = f1_per_class[i]

    return metrics

# ==========================================
# TRAINER
# ==========================================
class FocalLossTrainer(Trainer):
    def __init__(self, focal_loss_fn, *args, **kwargs):
        super().__init__(*args, **kwargs)
        self.focal_loss_fn = focal_loss_fn

    def compute_loss(self, model, inputs, return_outputs=False, num_items_in_batch=None):
        labels = inputs.pop("labels")
        outputs = model(**inputs)
        logits = outputs.logits
        loss = self.focal_loss_fn(logits, labels)
        return (loss, outputs) if return_outputs else loss

# ==========================================
# THRESHOLD OPTIMIZATION
# ==========================================
def optimize_thresholds(val_probs, val_labels, label_names):
    print(f"\n{'='*60}")
    print(f"THRESHOLD OPTIMIZATION")
    print(f"{'='*60}")

    best_thresholds = []

    for i, label_name in enumerate(label_names):
        best_thresh = 0.5
        best_f1 = 0

        # Wider range for Arabic (better balance)
        for thresh in np.arange(0.15, 0.75, 0.05):
            preds = (val_probs[:, i] >= thresh).astype(int)
            f1 = f1_score(val_labels[:, i], preds, zero_division=0)

            if f1 > best_f1:
                best_f1 = f1
                best_thresh = thresh

        best_thresholds.append(best_thresh)
        print(f"{label_name:20s}: threshold={best_thresh:.2f}, F1={best_f1:.4f}")

    return np.array(best_thresholds)

# ==========================================
# STAGE 1: FIND OPTIMAL THRESHOLDS
# ==========================================
def stage1_find_thresholds(train_df):
    """Train on split data to find optimal thresholds"""
    print(f"\n{'#'*60}")
    print(f"# STAGE 1: Threshold Optimization")
    print(f"{'#'*60}")

    # Split for threshold tuning
    has_label = (train_df[Config.LABEL_COLUMNS].sum(axis=1) > 0).astype(int)
    train_data, val_data = train_test_split(
        train_df,
        test_size=Config.VAL_SIZE,
        random_state=Config.SEED,
        stratify=has_label
    )

    print(f"\n📊 Split: {len(train_data)} train, {len(val_data)} val")

    # Initialize model
    tokenizer = AutoTokenizer.from_pretrained(Config.MODEL_NAME)
    model = AutoModelForSequenceClassification.from_pretrained(
        Config.MODEL_NAME,
        num_labels=len(Config.LABEL_COLUMNS),
        problem_type="multi_label_classification",
        hidden_dropout_prob=Config.HIDDEN_DROPOUT,
        attention_probs_dropout_prob=Config.ATTENTION_DROPOUT
    )

    # Datasets
    train_dataset = MultiLabelDataset(
        train_data['text'].tolist(),
        train_data[Config.LABEL_COLUMNS].values,
        tokenizer,
        max_length=Config.MAX_LENGTH
    )
    val_dataset = MultiLabelDataset(
        val_data['text'].tolist(),
        val_data[Config.LABEL_COLUMNS].values,
        tokenizer,
        max_length=Config.MAX_LENGTH
    )

    # Training arguments
    training_args = TrainingArguments(
        output_dir="./results_stage1",
        num_train_epochs=Config.STAGE1_EPOCHS,
        learning_rate=Config.LEARNING_RATE,
        per_device_train_batch_size=Config.BATCH_SIZE,
        per_device_eval_batch_size=Config.EVAL_BATCH_SIZE,
        gradient_accumulation_steps=2,
        weight_decay=Config.WEIGHT_DECAY,
        warmup_ratio=Config.WARMUP_RATIO,
        fp16=Config.USE_FP16,
        eval_strategy="epoch",
        save_strategy="epoch",
        load_best_model_at_end=True,
        metric_for_best_model="f1_macro",
        logging_steps=50,
        save_total_limit=1,
        report_to="none"
    )

    # Train
    focal_loss = FocalLoss(alpha=Config.FOCAL_ALPHA, gamma=Config.FOCAL_GAMMA)
    trainer = FocalLossTrainer(
        focal_loss_fn=focal_loss,
        model=model,
        args=training_args,
        train_dataset=train_dataset,
        eval_dataset=val_dataset,
        compute_metrics=compute_metrics,
        data_collator=DataCollatorWithPadding(tokenizer),
        callbacks=[EarlyStoppingCallback(early_stopping_patience=3)]
    )

    print(f"\n🚀 Training Stage 1...")
    trainer.train()

    # Evaluate
    eval_metrics = trainer.evaluate()
    print(f"\n✅ Stage 1 Results:")
    print(f"  F1 Macro: {eval_metrics['eval_f1_macro']:.4f}")

    # Get validation predictions
    val_predictions = trainer.predict(val_dataset)
    val_logits = val_predictions.predictions
    val_probs = 1 / (1 + np.exp(-val_logits))
    val_labels = val_data[Config.LABEL_COLUMNS].values

    # Optimize thresholds
    best_thresholds = optimize_thresholds(val_probs, val_labels, Config.LABEL_COLUMNS)

    # Cleanup
    del model, trainer, tokenizer
    torch.cuda.empty_cache()
    gc.collect()

    return best_thresholds

# ==========================================
# STAGE 2: TRAIN FINAL MODEL ON ALL DATA
# ==========================================
def stage2_train_final(train_df, test_df, best_thresholds):
    """Train on ALL training data and predict on test"""
    print(f"\n{'#'*60}")
    print(f"# STAGE 2: Final Model Training (All Data)")
    print(f"{'#'*60}")

    print(f"\n📊 Training on ALL {len(train_df)} samples")

    # Initialize model
    tokenizer = AutoTokenizer.from_pretrained(Config.MODEL_NAME)
    model = AutoModelForSequenceClassification.from_pretrained(
        Config.MODEL_NAME,
        num_labels=len(Config.LABEL_COLUMNS),
        problem_type="multi_label_classification",
        hidden_dropout_prob=Config.HIDDEN_DROPOUT,
        attention_probs_dropout_prob=Config.ATTENTION_DROPOUT
    )

    # Datasets
    train_dataset = MultiLabelDataset(
        train_df['text'].tolist(),
        train_df[Config.LABEL_COLUMNS].values,
        tokenizer,
        max_length=Config.MAX_LENGTH
    )

    # Training arguments - no validation
    training_args = TrainingArguments(
        output_dir="./results_stage2",
        num_train_epochs=Config.STAGE2_EPOCHS,
        learning_rate=Config.LEARNING_RATE,
        per_device_train_batch_size=Config.BATCH_SIZE,
        gradient_accumulation_steps=2,
        weight_decay=Config.WEIGHT_DECAY,
        warmup_ratio=Config.WARMUP_RATIO,
        fp16=Config.USE_FP16,
        logging_steps=50,
        save_strategy="no",  # No validation, no saving
        report_to="none"
    )

    # Train
    focal_loss = FocalLoss(alpha=Config.FOCAL_ALPHA, gamma=Config.FOCAL_GAMMA)
    trainer = FocalLossTrainer(
        focal_loss_fn=focal_loss,
        model=model,
        args=training_args,
        train_dataset=train_dataset,
        data_collator=DataCollatorWithPadding(tokenizer)
    )

    print(f"\n🚀 Training Final Model...")
    trainer.train()

    # Predict on test
    print(f"\n📊 Predicting on test set...")
    test_dataset = MultiLabelDataset(
        test_df['text'].tolist(),
        np.zeros((len(test_df), len(Config.LABEL_COLUMNS))),
        tokenizer,
        max_length=Config.MAX_LENGTH
    )

    test_predictions = trainer.predict(test_dataset)
    test_logits = test_predictions.predictions
    test_probs = 1 / (1 + np.exp(-test_logits))

    # Apply optimized thresholds
    final_preds = np.zeros_like(test_probs, dtype=int)
    for i, thresh in enumerate(best_thresholds):
        final_preds[:, i] = (test_probs[:, i] >= thresh).astype(int)

    # Cleanup
    del model, trainer, tokenizer
    torch.cuda.empty_cache()
    gc.collect()

    return final_preds

# ==========================================
# MAIN
# ==========================================
def main():
    print(f"\n{'#'*60}")
    print(f"# Arabic Subtask 2: Two-Stage Training")
    print(f"# Stage 1: Find thresholds on train/val split")
    print(f"# Stage 2: Train on all data, predict on test")
    print(f"{'#'*60}")

    # Load data
    print(f"\n📂 Loading data...")
    train_df = pd.read_csv(Config.TRAIN_PATH)
    test_df = pd.read_csv(Config.TEST_PATH)

    # Clean labels
    for col in Config.LABEL_COLUMNS:
        if col in train_df.columns:
            train_df[col] = train_df[col].fillna(0).astype(int)

    print(f"✅ Train: {len(train_df)} samples")
    print(f"✅ Test: {len(test_df)} samples (blind - no labels)")

    # Dataset statistics
    print(f"\n📊 Arabic Dataset Characteristics:")
    print(f"  Text length (mean): {train_df['text'].str.split().str.len().mean():.1f} words")
    print(f"  Text length (95th): {train_df['text'].str.split().str.len().quantile(0.95):.0f} words")
    print(f"  Samples with labels: {(train_df[Config.LABEL_COLUMNS].sum(axis=1) > 0).sum()} ({100*(train_df[Config.LABEL_COLUMNS].sum(axis=1) > 0).mean():.1f}%)")
    print(f"  Multi-label samples: {(train_df[Config.LABEL_COLUMNS].sum(axis=1) > 1).sum()} ({100*(train_df[Config.LABEL_COLUMNS].sum(axis=1) > 1).mean():.1f}%)")

    print(f"\n🏷️  Label Distribution:")
    for col in Config.LABEL_COLUMNS:
        count = train_df[col].sum()
        pct = 100 * train_df[col].mean()
        print(f"  {col:20s}: {count:4d} ({pct:5.2f}%)")

    # Stage 1: Find optimal thresholds
    best_thresholds = stage1_find_thresholds(train_df)

    # Stage 2: Train final model (optional, can skip for quick testing)
    if Config.TRAIN_FINAL_MODEL:
        final_preds = stage2_train_final(train_df, test_df, best_thresholds)

        # Save predictions
        print(f"\n💾 Saving predictions...")
        submission = pd.DataFrame(final_preds, columns=Config.LABEL_COLUMNS)
        submission.insert(0, 'id', test_df['id'])
        submission.to_csv(Config.OUTPUT_PATH, index=False)

        print(f"\n✅ Predictions saved to: {Config.OUTPUT_PATH}")

        # Prediction stats
        print(f"\n📊 Prediction Distribution:")
        for i, col in enumerate(Config.LABEL_COLUMNS):
            count = final_preds[:, i].sum()
            pct = 100 * count / len(final_preds)
            print(f"  {col:20s}: {count:3d} ({pct:5.2f}%)")
    else:
        print(f"\n⏭️  Skipping Stage 2 (TRAIN_FINAL_MODEL=False)")
        print(f"   Set TRAIN_FINAL_MODEL=True to generate final predictions")

    print(f"\n{'='*60}")
    print(f"✅ COMPLETE!")
    print(f"{'='*60}")

    return best_thresholds

if __name__ == "__main__":
    thresholds = main()


############################################################
# Arabic Subtask 2: Two-Stage Training
# Stage 1: Find thresholds on train/val split
# Stage 2: Train on all data, predict on test
############################################################

📂 Loading data...
✅ Train: 3380 samples
✅ Test: 169 samples (blind - no labels)

📊 Arabic Dataset Characteristics:
  Text length (mean): 16.7 words
  Text length (95th): 31 words
  Samples with labels: 1512 (44.7%)
  Multi-label samples: 828 (24.5%)

🏷️  Label Distribution:
  political           :  780 (23.08%)
  racial/ethnic       :  583 (17.25%)
  religious           :  283 ( 8.37%)
  gender/sexual       :  369 (10.92%)
  other               :  565 (16.72%)

############################################################
# STAGE 1: Threshold Optimization
############################################################

📊 Split: 2704 train, 676 val


pytorch_model.bin:   0%|          | 0.00/654M [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/654M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at UBC-NLP/MARBERT and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.



🚀 Training Stage 1...


Epoch,Training Loss,Validation Loss,F1 Macro,F1 Political,F1 Racial/ethnic,F1 Religious,F1 Gender/sexual,F1 Other
1,0.0431,0.025721,0.455191,0.695035,0.537736,0.44186,0.329897,0.271429
2,0.0216,0.023479,0.610697,0.70405,0.626728,0.641509,0.573427,0.507772
3,0.0142,0.025719,0.566122,0.713805,0.629108,0.55102,0.461538,0.475138
4,0.0103,0.028881,0.575712,0.711246,0.666667,0.556701,0.52459,0.419355
5,0.0074,0.02946,0.592476,0.714286,0.669643,0.563107,0.558824,0.456522



✅ Stage 1 Results:
  F1 Macro: 0.6107

THRESHOLD OPTIMIZATION
political           : threshold=0.45, F1=0.7225
racial/ethnic       : threshold=0.40, F1=0.6692
religious           : threshold=0.50, F1=0.6415
gender/sexual       : threshold=0.45, F1=0.6087
other               : threshold=0.45, F1=0.5462

############################################################
# STAGE 2: Final Model Training (All Data)
############################################################

📊 Training on ALL 3380 samples


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at UBC-NLP/MARBERT and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.



🚀 Training Final Model...


Step,Training Loss
50,0.0447
100,0.0289
150,0.0218
200,0.0214
250,0.0148
300,0.0145
350,0.0111
400,0.0085
450,0.0072
500,0.006



📊 Predicting on test set...



💾 Saving predictions...

✅ Predictions saved to: /content/gdrive/MyDrive/SemEval/dev_phase/subtask2/pred_arb_marbert.csv

📊 Prediction Distribution:
  political           :  38 (22.49%)
  racial/ethnic       :  32 (18.93%)
  religious           :  11 ( 6.51%)
  gender/sexual       :  15 ( 8.88%)
  other               :  25 (14.79%)

✅ COMPLETE!


# Subtask 3

## english

In [13]:
"""
English Subtask 3: Two-Stage Training (matching Arabic approach)
Stage 1: Find thresholds on train/val split
Stage 2: Train on ALL data, predict on test
"""

# ==========================================
# CONFIGURATION
# ==========================================
class Config:
    # Paths
    TRAIN_PATH = "/content/gdrive/MyDrive/SemEval/dev_phase/subtask3/train/eng.csv"
    TEST_PATH = "/content/gdrive/MyDrive/SemEval/dev_phase/subtask3/dev/eng.csv"
    OUTPUT_PATH = "/content/gdrive/MyDrive/SemEval/dev_phase/subtask3/pred_eng_two_stage.csv"

    # Labels
    LABEL_COLUMNS = ['stereotype', 'vilification', 'dehumanization',
                     'extreme_language', 'lack_of_empathy', 'invalidation']

    # Model
    MODEL_NAME = "cardiffnlp/twitter-roberta-base-sentiment-latest"
    MAX_LENGTH = 128

    # Training
    STAGE1_EPOCHS = 5  # For threshold finding
    STAGE2_EPOCHS = 6  # For final model
    BATCH_SIZE = 16
    EVAL_BATCH_SIZE = 32
    LEARNING_RATE = 2e-5
    WEIGHT_DECAY = 0.01
    WARMUP_RATIO = 0.1
    VAL_SIZE = 0.2

    # Regularization
    HIDDEN_DROPOUT = 0.1
    ATTENTION_DROPOUT = 0.1

    # Other
    SEED = 42
    USE_FP16 = True
    TRAIN_FINAL_MODEL = True

def set_seed(seed):
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)

set_seed(Config.SEED)

# ==========================================
# DATASET
# ==========================================
class MultiLabelDataset(torch.utils.data.Dataset):
    def __init__(self, texts, labels, tokenizer, max_length=128):
        self.texts = texts
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        encoding = self.tokenizer(
            self.texts[idx],
            truncation=True,
            padding=False,
            max_length=self.max_length,
            return_tensors='pt'
        )

        item = {key: encoding[key].squeeze() for key in encoding.keys()}
        item['labels'] = torch.tensor(self.labels[idx], dtype=torch.float)
        return item

# ==========================================
# METRICS
# ==========================================
def compute_metrics(eval_pred):
    predictions, labels = eval_pred
    probs = 1 / (1 + np.exp(-predictions))
    preds = (probs > 0.5).astype(int)

    f1_macro = f1_score(labels, preds, average='macro', zero_division=0)
    f1_per_class = f1_score(labels, preds, average=None, zero_division=0)

    metrics = {'f1_macro': f1_macro}
    for i, col in enumerate(Config.LABEL_COLUMNS):
        metrics[f'f1_{col}'] = f1_per_class[i]

    return metrics

# ==========================================
# CLASS-WEIGHTED TRAINER
# ==========================================
class WeightedMultiLabelTrainer(Trainer):
    def __init__(self, pos_weights, *args, **kwargs):
        super().__init__(*args, **kwargs)
        self.pos_weights = pos_weights

    def compute_loss(self, model, inputs, return_outputs=False, num_items_in_batch=None):
        labels = inputs.get("labels")
        outputs = model(**inputs)
        logits = outputs.get("logits")

        loss_fct = nn.BCEWithLogitsLoss(pos_weight=self.pos_weights.to(model.device))
        loss = loss_fct(logits, labels)

        return (loss, outputs) if return_outputs else loss

# ==========================================
# THRESHOLD OPTIMIZATION
# ==========================================
def optimize_thresholds(val_probs, val_labels, label_names):
    print(f"\n{'='*60}")
    print(f"THRESHOLD OPTIMIZATION")
    print(f"{'='*60}")

    best_thresholds = []

    for i, label_name in enumerate(label_names):
        best_thresh = 0.5
        best_f1 = 0

        # Scan thresholds
        for thresh in np.arange(0.1, 0.9, 0.05):
            preds = (val_probs[:, i] >= thresh).astype(int)
            f1 = f1_score(val_labels[:, i], preds, zero_division=0)

            if f1 > best_f1:
                best_f1 = f1
                best_thresh = thresh

        best_thresholds.append(best_thresh)
        print(f"{label_name:20s}: threshold={best_thresh:.2f}, F1={best_f1:.4f}")

    return np.array(best_thresholds)

# ==========================================
# STAGE 1: FIND THRESHOLDS
# ==========================================
def stage1_find_thresholds(train_df):
    print(f"\n{'#'*60}")
    print(f"# STAGE 1: Threshold Optimization")
    print(f"{'#'*60}")

    # Split
    has_label = (train_df[Config.LABEL_COLUMNS].sum(axis=1) > 0).astype(int)
    train_data, val_data = train_test_split(
        train_df,
        test_size=Config.VAL_SIZE,
        random_state=Config.SEED,
        stratify=has_label
    )

    print(f"\n📊 Split: {len(train_data)} train, {len(val_data)} val")

    # Calculate class weights
    pos_counts = train_data[Config.LABEL_COLUMNS].sum()
    neg_counts = len(train_data) - pos_counts
    pos_weights = neg_counts / (pos_counts + 1e-5)
    pos_weights_tensor = torch.tensor(pos_weights.values, dtype=torch.float)

    print(f"\nClass weights:")
    for col, weight in zip(Config.LABEL_COLUMNS, pos_weights):
        print(f"  {col:20s}: {weight:.3f}")

    # Initialize
    tokenizer = AutoTokenizer.from_pretrained(Config.MODEL_NAME)
    model = AutoModelForSequenceClassification.from_pretrained(
        Config.MODEL_NAME,
        num_labels=len(Config.LABEL_COLUMNS),
        ignore_mismatched_sizes=True,
        problem_type="multi_label_classification",
        hidden_dropout_prob=Config.HIDDEN_DROPOUT,
        attention_probs_dropout_prob=Config.ATTENTION_DROPOUT
    )

    # Datasets
    train_dataset = MultiLabelDataset(
        train_data['text'].tolist(),
        train_data[Config.LABEL_COLUMNS].values,
        tokenizer,
        max_length=Config.MAX_LENGTH
    )
    val_dataset = MultiLabelDataset(
        val_data['text'].tolist(),
        val_data[Config.LABEL_COLUMNS].values,
        tokenizer,
        max_length=Config.MAX_LENGTH
    )

    # Training args
    training_args = TrainingArguments(
        output_dir="./results_eng_s3_stage1",
        num_train_epochs=Config.STAGE1_EPOCHS,
        learning_rate=Config.LEARNING_RATE,
        per_device_train_batch_size=Config.BATCH_SIZE,
        per_device_eval_batch_size=Config.EVAL_BATCH_SIZE,
        gradient_accumulation_steps=2,
        weight_decay=Config.WEIGHT_DECAY,
        warmup_ratio=Config.WARMUP_RATIO,
        fp16=Config.USE_FP16,
        eval_strategy="epoch",
        save_strategy="epoch",
        load_best_model_at_end=True,
        metric_for_best_model="f1_macro",
        logging_steps=50,
        save_total_limit=1,
        report_to="none"
    )

    # Train
    trainer = WeightedMultiLabelTrainer(
        pos_weights=pos_weights_tensor,
        model=model,
        args=training_args,
        train_dataset=train_dataset,
        eval_dataset=val_dataset,
        compute_metrics=compute_metrics,
        data_collator=DataCollatorWithPadding(tokenizer),
        callbacks=[EarlyStoppingCallback(early_stopping_patience=2)]
    )

    print(f"\n🚀 Training Stage 1...")
    trainer.train()

    # Evaluate
    eval_metrics = trainer.evaluate()
    print(f"\n✅ Stage 1 Results:")
    print(f"  F1 Macro: {eval_metrics['eval_f1_macro']:.4f}")
    print(f"\n  Per-class F1:")
    for col in Config.LABEL_COLUMNS:
        print(f"    {col:20s}: {eval_metrics[f'eval_f1_{col}']:.4f}")

    # Get predictions
    val_predictions = trainer.predict(val_dataset)
    val_logits = val_predictions.predictions
    val_probs = 1 / (1 + np.exp(-val_logits))
    val_labels = val_data[Config.LABEL_COLUMNS].values

    # Optimize thresholds
    best_thresholds = optimize_thresholds(val_probs, val_labels, Config.LABEL_COLUMNS)

    # Calculate optimized F1
    val_preds_optimized = np.zeros_like(val_probs, dtype=int)
    for i, thresh in enumerate(best_thresholds):
        val_preds_optimized[:, i] = (val_probs[:, i] >= thresh).astype(int)

    optimized_f1 = f1_score(val_labels, val_preds_optimized, average='macro', zero_division=0)
    print(f"\n🎯 Optimized Validation F1: {optimized_f1:.4f}")

    # Cleanup
    del model, trainer, tokenizer
    torch.cuda.empty_cache()
    gc.collect()

    return best_thresholds

# ==========================================
# STAGE 2: FINAL MODEL
# ==========================================
def stage2_train_final(train_df, test_df, best_thresholds):
    print(f"\n{'#'*60}")
    print(f"# STAGE 2: Final Model Training (All Data)")
    print(f"{'#'*60}")

    print(f"\n📊 Training on ALL {len(train_df)} samples")

    # Calculate class weights on full dataset
    pos_counts = train_df[Config.LABEL_COLUMNS].sum()
    neg_counts = len(train_df) - pos_counts
    pos_weights = neg_counts / (pos_counts + 1e-5)
    pos_weights_tensor = torch.tensor(pos_weights.values, dtype=torch.float)

    # Initialize
    tokenizer = AutoTokenizer.from_pretrained(Config.MODEL_NAME)
    model = AutoModelForSequenceClassification.from_pretrained(
        Config.MODEL_NAME,
        num_labels=len(Config.LABEL_COLUMNS),
        ignore_mismatched_sizes=True,
        problem_type="multi_label_classification",
        hidden_dropout_prob=Config.HIDDEN_DROPOUT,
        attention_probs_dropout_prob=Config.ATTENTION_DROPOUT
    )

    # Dataset
    train_dataset = MultiLabelDataset(
        train_df['text'].tolist(),
        train_df[Config.LABEL_COLUMNS].values,
        tokenizer,
        max_length=Config.MAX_LENGTH
    )

    # Training args
    training_args = TrainingArguments(
        output_dir="./results_eng_s3_stage2",
        num_train_epochs=Config.STAGE2_EPOCHS,
        learning_rate=Config.LEARNING_RATE,
        per_device_train_batch_size=Config.BATCH_SIZE,
        gradient_accumulation_steps=2,
        weight_decay=Config.WEIGHT_DECAY,
        warmup_ratio=Config.WARMUP_RATIO,
        fp16=Config.USE_FP16,
        logging_steps=50,
        save_strategy="no",
        report_to="none"
    )

    # Train
    trainer = WeightedMultiLabelTrainer(
        pos_weights=pos_weights_tensor,
        model=model,
        args=training_args,
        train_dataset=train_dataset,
        data_collator=DataCollatorWithPadding(tokenizer)
    )

    print(f"\n🚀 Training Final Model...")
    trainer.train()

    # Predict
    print(f"\n📊 Predicting on test set...")
    test_dataset = MultiLabelDataset(
        test_df['text'].tolist(),
        np.zeros((len(test_df), len(Config.LABEL_COLUMNS))),
        tokenizer,
        max_length=Config.MAX_LENGTH
    )

    test_predictions = trainer.predict(test_dataset)
    test_logits = test_predictions.predictions
    test_probs = 1 / (1 + np.exp(-test_logits))

    # Apply thresholds
    final_preds = np.zeros_like(test_probs, dtype=int)
    for i, thresh in enumerate(best_thresholds):
        final_preds[:, i] = (test_probs[:, i] >= thresh).astype(int)

    # Cleanup
    del model, trainer, tokenizer
    torch.cuda.empty_cache()
    gc.collect()

    return final_preds

# ==========================================
# MAIN
# ==========================================
def main():
    print(f"\n{'#'*60}")
    print(f"# English Subtask 3: Two-Stage Training")
    print(f"# Manifestation Identification")
    print(f"{'#'*60}")

    # Load
    print(f"\n📂 Loading data...")
    train_df = pd.read_csv(Config.TRAIN_PATH)
    test_df = pd.read_csv(Config.TEST_PATH)

    # Clean
    for col in Config.LABEL_COLUMNS:
        if col in train_df.columns:
            train_df[col] = train_df[col].fillna(0).astype(int)

    print(f"✅ Train: {len(train_df)} samples")
    print(f"✅ Test: {len(test_df)} samples (blind)")

    # Stats
    print(f"\n📊 Dataset Characteristics:")
    label_sums = train_df[Config.LABEL_COLUMNS].sum(axis=1)
    print(f"  Multi-label density: {100*(label_sums > 1).mean():.1f}%")
    print(f"  Avg labels/sample: {label_sums.mean():.2f}")

    print(f"\n🏷️  Label Distribution:")
    for col in Config.LABEL_COLUMNS:
        count = train_df[col].sum()
        pct = 100 * train_df[col].mean()
        print(f"  {col:20s}: {count:4d} ({pct:5.2f}%)")

    # Stage 1
    best_thresholds = stage1_find_thresholds(train_df)

    # Stage 2
    if Config.TRAIN_FINAL_MODEL:
        final_preds = stage2_train_final(train_df, test_df, best_thresholds)

        # Save
        print(f"\n💾 Saving predictions...")
        submission = pd.DataFrame(final_preds, columns=Config.LABEL_COLUMNS)
        submission.insert(0, 'id', test_df['id'])
        submission.to_csv(Config.OUTPUT_PATH, index=False)

        print(f"\n✅ Saved: {Config.OUTPUT_PATH}")

        # Stats
        print(f"\n📊 Prediction Distribution:")
        for i, col in enumerate(Config.LABEL_COLUMNS):
            count = final_preds[:, i].sum()
            pct = 100 * count / len(final_preds)
            print(f"  {col:20s}: {count:3d} ({pct:5.2f}%)")

        label_sums = final_preds.sum(axis=1)
        no_labels = (label_sums == 0).sum()
        multi_labels = (label_sums > 1).sum()
        print(f"\n  No labels: {no_labels} ({100*no_labels/len(final_preds):.1f}%)")
        print(f"  Multi-label: {multi_labels} ({100*multi_labels/len(final_preds):.1f}%)")
    else:
        print(f"\n⭐ Skipping Stage 2")

    print(f"\n{'='*60}")
    print(f"✅ COMPLETE!")
    print(f"{'='*60}")

    return best_thresholds

if __name__ == "__main__":
    thresholds = main()
    print("\n✨ Done!")


############################################################
# English Subtask 3: Two-Stage Training
# Manifestation Identification
############################################################

📂 Loading data...
✅ Train: 3222 samples
✅ Test: 160 samples (blind)

📊 Dataset Characteristics:
  Multi-label density: 31.2%
  Avg labels/sample: 1.07

🏷️  Label Distribution:
  stereotype          :  487 (15.11%)
  vilification        :  858 (26.63%)
  dehumanization      :  391 (12.14%)
  extreme_language    :  770 (23.90%)
  lack_of_empathy     :  357 (11.08%)
  invalidation        :  586 (18.19%)

############################################################
# STAGE 1: Threshold Optimization
############################################################

📊 Split: 2577 train, 645 val

Class weights:
  stereotype          : 5.642
  vilification        : 2.746
  dehumanization      : 7.207
  extreme_language    : 3.204
  lack_of_empathy     : 8.204
  invalidation        : 4.403


Some weights of the model checkpoint at cardiffnlp/twitter-roberta-base-sentiment-latest were not used when initializing RobertaForSequenceClassification: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at cardiffnlp/twitter-roberta-base-sentiment-latest and are newly initialized because the shapes did not match:
- classifier.out_proj.weight: found shape torch.Size([3, 768]) in the checkpo


🚀 Training Stage 1...


Epoch,Training Loss,Validation Loss,F1 Macro,F1 Stereotype,F1 Vilification,F1 Dehumanization,F1 Extreme Language,F1 Lack Of Empathy,F1 Invalidation
1,1.0397,0.910913,0.446175,0.410714,0.61657,0.340426,0.566406,0.3125,0.430435
2,0.8019,0.861368,0.494892,0.477064,0.658537,0.408027,0.608911,0.345912,0.470899
3,0.7688,0.866477,0.503219,0.48,0.683603,0.419753,0.621495,0.362018,0.452442
4,0.6714,0.897485,0.508935,0.498516,0.679426,0.431894,0.622549,0.35474,0.466488
5,0.6005,0.89501,0.517794,0.510638,0.695864,0.43686,0.62531,0.369427,0.468665



✅ Stage 1 Results:
  F1 Macro: 0.5178

  Per-class F1:
    stereotype          : 0.5106
    vilification        : 0.6959
    dehumanization      : 0.4369
    extreme_language    : 0.6253
    lack_of_empathy     : 0.3694
    invalidation        : 0.4687

THRESHOLD OPTIMIZATION
stereotype          : threshold=0.65, F1=0.5246
vilification        : threshold=0.65, F1=0.7020
dehumanization      : threshold=0.80, F1=0.4631
extreme_language    : threshold=0.65, F1=0.6440
lack_of_empathy     : threshold=0.70, F1=0.3831
invalidation        : threshold=0.70, F1=0.4940

🎯 Optimized Validation F1: 0.5351

############################################################
# STAGE 2: Final Model Training (All Data)
############################################################

📊 Training on ALL 3222 samples


Some weights of the model checkpoint at cardiffnlp/twitter-roberta-base-sentiment-latest were not used when initializing RobertaForSequenceClassification: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at cardiffnlp/twitter-roberta-base-sentiment-latest and are newly initialized because the shapes did not match:
- classifier.out_proj.weight: found shape torch.Size([3, 768]) in the checkpo


🚀 Training Final Model...


Step,Training Loss
50,1.0681
100,0.947
150,0.8339
200,0.8369
250,0.7509
300,0.7368
350,0.6776
400,0.6464
450,0.6307
500,0.6258



📊 Predicting on test set...



💾 Saving predictions...

✅ Saved: /content/gdrive/MyDrive/SemEval/dev_phase/subtask3/pred_eng_two_stage.csv

📊 Prediction Distribution:
  stereotype          :  41 (25.62%)
  vilification        :  49 (30.62%)
  dehumanization      :  26 (16.25%)
  extreme_language    :  48 (30.00%)
  lack_of_empathy     :  33 (20.62%)
  invalidation        :  48 (30.00%)

  No labels: 107 (66.9%)
  Multi-label: 48 (30.0%)

✅ COMPLETE!

✨ Done!


## arabic

In [25]:
"""
FIXED: Arabic Subtask 3 with MARBERT
Manifestation Identification

Optimized for Arabic characteristics:
- Higher multi-label density (39% vs 13%)
- Strong correlations (0.65-0.66 between top labels)
- Average 1.37 labels per sample
- Better class balance (4.6:1 vs English)
"""

# ==========================================
# CONFIGURATION
# ==========================================
class Config:
    # Paths

    TRAIN_PATH = "/content/gdrive/MyDrive/SemEval/dev_phase/subtask3/train/arb.csv"
    TEST_PATH = "/content/gdrive/MyDrive/SemEval/dev_phase/subtask3/dev/arb.csv"
    OUTPUT_PATH = "/content/gdrive/MyDrive/SemEval/dev_phase/subtask3/pred_arb_marbert.csv"

    # Labels - 6 manifestation types
    LABEL_COLUMNS = [
        'vilification',
        'extreme_language',
        'stereotype',
        'invalidation',
        'lack_of_empathy',
        'dehumanization'
    ]

    # Model
    MODEL_NAME = "bert-base-multilingual-cased"
    # MODEL_NAME = "UBC-NLP/MARBERT"
    MAX_LENGTH = 96

    # Training
    STAGE1_EPOCHS = 5
    STAGE2_EPOCHS = 6
    BATCH_SIZE = 16
    EVAL_BATCH_SIZE = 32
    LEARNING_RATE = 2e-5
    WEIGHT_DECAY = 0.01
    WARMUP_RATIO = 0.1
    VAL_SIZE = 0.2

    # Regularization
    HIDDEN_DROPOUT = 0.1  # Reduced (better balance)
    ATTENTION_DROPOUT = 0.1

    # Loss
    USE_FOCAL_LOSS = True
    FOCAL_ALPHA = 0.25
    FOCAL_GAMMA = 1.5  # Reduced (better balance than English)

    # Other
    SEED = 42
    USE_FP16 = True
    TRAIN_FINAL_MODEL = True

def set_seed(seed):
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)

set_seed(Config.SEED)

# ==========================================
# FOCAL LOSS
# ==========================================
class FocalLoss(nn.Module):
    def __init__(self, alpha=0.25, gamma=1.5):
        super().__init__()
        self.alpha = alpha
        self.gamma = gamma

    def forward(self, inputs, targets):
        BCE_loss = F.binary_cross_entropy_with_logits(inputs, targets, reduction='none')
        pt = torch.exp(-BCE_loss)
        focal_loss = self.alpha * (1 - pt) ** self.gamma * BCE_loss
        return focal_loss.mean()

# ==========================================
# DATASET
# ==========================================
class MultiLabelDataset(torch.utils.data.Dataset):
    def __init__(self, texts, labels, tokenizer, max_length=96):
        self.texts = texts
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        encoding = self.tokenizer(
            self.texts[idx],
            truncation=True,
            padding=False,
            max_length=self.max_length,
            return_tensors='pt'
        )

        item = {key: encoding[key].squeeze() for key in encoding.keys()}
        item['labels'] = torch.tensor(self.labels[idx], dtype=torch.float)
        return item

# ==========================================
# METRICS
# ==========================================
def compute_metrics(eval_pred):
    predictions, labels = eval_pred
    probs = 1 / (1 + np.exp(-predictions))
    preds = (probs > 0.5).astype(int)

    f1_macro = f1_score(labels, preds, average='macro', zero_division=0)
    f1_per_class = f1_score(labels, preds, average=None, zero_division=0)

    metrics = {'f1_macro': f1_macro}
    for i, col in enumerate(Config.LABEL_COLUMNS):
        metrics[f'f1_{col}'] = f1_per_class[i]

    return metrics

# ==========================================
# TRAINER
# ==========================================
class FocalLossTrainer(Trainer):
    def __init__(self, focal_loss_fn, *args, **kwargs):
        super().__init__(*args, **kwargs)
        self.focal_loss_fn = focal_loss_fn

    def compute_loss(self, model, inputs, return_outputs=False, num_items_in_batch=None):
        labels = inputs.pop("labels")
        outputs = model(**inputs)
        logits = outputs.logits
        loss = self.focal_loss_fn(logits, labels)
        return (loss, outputs) if return_outputs else loss

# ==========================================
# THRESHOLD OPTIMIZATION
# ==========================================
def optimize_thresholds(val_probs, val_labels, label_names):
    print(f"\n{'='*60}")
    print(f"THRESHOLD OPTIMIZATION")
    print(f"{'='*60}")

    best_thresholds = []

    for i, label_name in enumerate(label_names):
        best_thresh = 0.5
        best_f1 = 0

        # Scan thresholds
        for thresh in np.arange(0.2, 0.7, 0.05):
            preds = (val_probs[:, i] >= thresh).astype(int)
            f1 = f1_score(val_labels[:, i], preds, zero_division=0)

            if f1 > best_f1:
                best_f1 = f1
                best_thresh = thresh

        best_thresholds.append(best_thresh)
        print(f"{label_name:25s}: threshold={best_thresh:.2f}, F1={best_f1:.4f}")

    return np.array(best_thresholds)

# ==========================================
# STAGE 1: THRESHOLD OPTIMIZATION
# ==========================================
def stage1_find_thresholds(train_df):
    print(f"\n{'#'*60}")
    print(f"# STAGE 1: Threshold Optimization")
    print(f"{'#'*60}")

    # Split
    has_label = (train_df[Config.LABEL_COLUMNS].sum(axis=1) > 0).astype(int)
    train_data, val_data = train_test_split(
        train_df,
        test_size=Config.VAL_SIZE,
        random_state=Config.SEED,
        stratify=has_label
    )

    print(f"\n📊 Split: {len(train_data)} train, {len(val_data)} val")

    # Initialize
    tokenizer = AutoTokenizer.from_pretrained(Config.MODEL_NAME)
    model = AutoModelForSequenceClassification.from_pretrained(
        Config.MODEL_NAME,
        num_labels=len(Config.LABEL_COLUMNS),
        problem_type="multi_label_classification",
        hidden_dropout_prob=Config.HIDDEN_DROPOUT,
        attention_probs_dropout_prob=Config.ATTENTION_DROPOUT
    )

    # Datasets
    train_dataset = MultiLabelDataset(
        train_data['text'].tolist(),
        train_data[Config.LABEL_COLUMNS].values,
        tokenizer,
        max_length=Config.MAX_LENGTH
    )
    val_dataset = MultiLabelDataset(
        val_data['text'].tolist(),
        val_data[Config.LABEL_COLUMNS].values,
        tokenizer,
        max_length=Config.MAX_LENGTH
    )

    # Training args
    training_args = TrainingArguments(
        output_dir="./results_s3_stage1",
        num_train_epochs=Config.STAGE1_EPOCHS,
        learning_rate=Config.LEARNING_RATE,
        per_device_train_batch_size=Config.BATCH_SIZE,
        per_device_eval_batch_size=Config.EVAL_BATCH_SIZE,
        gradient_accumulation_steps=2,
        weight_decay=Config.WEIGHT_DECAY,
        warmup_ratio=Config.WARMUP_RATIO,
        fp16=Config.USE_FP16,
        eval_strategy="epoch",
        save_strategy="epoch",
        load_best_model_at_end=True,
        metric_for_best_model="f1_macro",
        logging_steps=50,
        save_total_limit=1,
        report_to="none"
    )

    # Train
    focal_loss = FocalLoss(alpha=Config.FOCAL_ALPHA, gamma=Config.FOCAL_GAMMA)
    trainer = FocalLossTrainer(
        focal_loss_fn=focal_loss,
        model=model,
        args=training_args,
        train_dataset=train_dataset,
        eval_dataset=val_dataset,
        compute_metrics=compute_metrics,
        data_collator=DataCollatorWithPadding(tokenizer),
        callbacks=[EarlyStoppingCallback(early_stopping_patience=3)]
    )

    print(f"\n🚀 Training Stage 1...")
    trainer.train()

    # Evaluate
    eval_metrics = trainer.evaluate()
    print(f"\n✅ Stage 1 Results:")
    print(f"  F1 Macro: {eval_metrics['eval_f1_macro']:.4f}")
    print(f"\n  Per-class F1:")
    for col in Config.LABEL_COLUMNS:
        print(f"    {col:25s}: {eval_metrics[f'eval_f1_{col}']:.4f}")

    # Get predictions
    val_predictions = trainer.predict(val_dataset)
    val_logits = val_predictions.predictions
    val_probs = 1 / (1 + np.exp(-val_logits))
    val_labels = val_data[Config.LABEL_COLUMNS].values

    # Optimize thresholds
    best_thresholds = optimize_thresholds(val_probs, val_labels, Config.LABEL_COLUMNS)

    # Cleanup
    del model, trainer, tokenizer
    torch.cuda.empty_cache()
    gc.collect()

    return best_thresholds

# ==========================================
# STAGE 2: FINAL MODEL
# ==========================================
def stage2_train_final(train_df, test_df, best_thresholds):
    print(f"\n{'#'*60}")
    print(f"# STAGE 2: Final Model Training")
    print(f"{'#'*60}")

    print(f"\n📊 Training on ALL {len(train_df)} samples")

    # Initialize
    tokenizer = AutoTokenizer.from_pretrained(Config.MODEL_NAME)
    model = AutoModelForSequenceClassification.from_pretrained(
        Config.MODEL_NAME,
        num_labels=len(Config.LABEL_COLUMNS),
        problem_type="multi_label_classification",
        hidden_dropout_prob=Config.HIDDEN_DROPOUT,
        attention_probs_dropout_prob=Config.ATTENTION_DROPOUT
    )

    # Dataset
    train_dataset = MultiLabelDataset(
        train_df['text'].tolist(),
        train_df[Config.LABEL_COLUMNS].values,
        tokenizer,
        max_length=Config.MAX_LENGTH
    )

    # Training args
    training_args = TrainingArguments(
        output_dir="./results_s3_stage2",
        num_train_epochs=Config.STAGE2_EPOCHS,
        learning_rate=Config.LEARNING_RATE,
        per_device_train_batch_size=Config.BATCH_SIZE,
        gradient_accumulation_steps=2,
        weight_decay=Config.WEIGHT_DECAY,
        warmup_ratio=Config.WARMUP_RATIO,
        fp16=Config.USE_FP16,
        logging_steps=50,
        save_strategy="no",
        report_to="none"
    )

    # Train
    focal_loss = FocalLoss(alpha=Config.FOCAL_ALPHA, gamma=Config.FOCAL_GAMMA)
    trainer = FocalLossTrainer(
        focal_loss_fn=focal_loss,
        model=model,
        args=training_args,
        train_dataset=train_dataset,
        data_collator=DataCollatorWithPadding(tokenizer)
    )

    print(f"\n🚀 Training Final Model...")
    trainer.train()

    # Predict
    print(f"\n📊 Predicting on test set...")
    test_dataset = MultiLabelDataset(
        test_df['text'].tolist(),
        np.zeros((len(test_df), len(Config.LABEL_COLUMNS))),
        tokenizer,
        max_length=Config.MAX_LENGTH
    )

    test_predictions = trainer.predict(test_dataset)
    test_logits = test_predictions.predictions
    test_probs = 1 / (1 + np.exp(-test_logits))

    # Apply thresholds
    final_preds = np.zeros_like(test_probs, dtype=int)
    for i, thresh in enumerate(best_thresholds):
        final_preds[:, i] = (test_probs[:, i] >= thresh).astype(int)

    # Cleanup
    del model, trainer, tokenizer
    torch.cuda.empty_cache()
    gc.collect()

    return final_preds

# ==========================================
# MAIN
# ==========================================
def main():
    print(f"\n{'#'*60}")
    print(f"# Arabic Subtask 3: Two-Stage Training")
    print(f"# Manifestation Identification (6 labels)")
    print(f"{'#'*60}")

    # Load
    print(f"\n📂 Loading data...")
    train_df = pd.read_csv(Config.TRAIN_PATH)
    test_df = pd.read_csv(Config.TEST_PATH)

    # Clean
    for col in Config.LABEL_COLUMNS:
        if col in train_df.columns:
            train_df[col] = train_df[col].fillna(0).astype(int)

    print(f"✅ Train: {len(train_df)} samples")
    print(f"✅ Test: {len(test_df)} samples (blind)")

    # Stats
    print(f"\n📊 Arabic Subtask 3 Characteristics:")
    print(f"  Multi-label density: {100*(train_df[Config.LABEL_COLUMNS].sum(axis=1) > 1).mean():.1f}%")
    print(f"  Avg labels/sample: {train_df[Config.LABEL_COLUMNS].sum(axis=1).mean():.2f}")

    print(f"\n🏷️  Label Distribution:")
    for col in Config.LABEL_COLUMNS:
        count = train_df[col].sum()
        pct = 100 * train_df[col].mean()
        print(f"  {col:25s}: {count:4d} ({pct:5.2f}%)")

    # Stage 1
    best_thresholds = stage1_find_thresholds(train_df)

    # Stage 2
    if Config.TRAIN_FINAL_MODEL:
        final_preds = stage2_train_final(train_df, test_df, best_thresholds)

        # Save
        print(f"\n💾 Saving predictions...")
        submission = pd.DataFrame(final_preds, columns=Config.LABEL_COLUMNS)
        submission.insert(0, 'id', test_df['id'])
        submission.to_csv(Config.OUTPUT_PATH, index=False)

        print(f"\n✅ Saved: {Config.OUTPUT_PATH}")

        # Stats
        print(f"\n📊 Prediction Distribution:")
        for i, col in enumerate(Config.LABEL_COLUMNS):
            count = final_preds[:, i].sum()
            pct = 100 * count / len(final_preds)
            print(f"  {col:25s}: {count:3d} ({pct:5.2f}%)")

        no_labels = (final_preds.sum(axis=1) == 0).sum()
        multi_labels = (final_preds.sum(axis=1) > 1).sum()
        print(f"\n  No labels: {no_labels} ({100*no_labels/len(final_preds):.1f}%)")
        print(f"  Multi-label: {multi_labels} ({100*multi_labels/len(final_preds):.1f}%)")
    else:
        print(f"\n⏭️  Skipping Stage 2")

    print(f"\n{'='*60}")
    print(f"✅ COMPLETE!")
    print(f"{'='*60}")

    return best_thresholds

if __name__ == "__main__":
    thresholds = main()


############################################################
# Arabic Subtask 3: Two-Stage Training
# Manifestation Identification (6 labels)
############################################################

📂 Loading data...
✅ Train: 3380 samples
✅ Test: 169 samples (blind)

📊 Arabic Subtask 3 Characteristics:
  Multi-label density: 39.0%
  Avg labels/sample: 1.37

🏷️  Label Distribution:
  vilification             : 1256 (37.16%)
  extreme_language         : 1027 (30.38%)
  stereotype               : 1127 (33.34%)
  invalidation             :  274 ( 8.11%)
  lack_of_empathy          :  575 (17.01%)
  dehumanization           :  370 (10.95%)

############################################################
# STAGE 1: Threshold Optimization
############################################################

📊 Split: 2704 train, 676 val


model.safetensors:   0%|          | 0.00/714M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-multilingual-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.



🚀 Training Stage 1...


Epoch,Training Loss,Validation Loss,F1 Macro,F1 Vilification,F1 Extreme Language,F1 Stereotype,F1 Invalidation,F1 Lack Of Empathy,F1 Dehumanization
1,0.0519,0.043796,0.075142,0.367347,0.009434,0.074074,0.0,0.0,0.0
2,0.0373,0.037757,0.274213,0.646943,0.48,0.518337,0.0,0.0,0.0
3,0.033,0.038418,0.392968,0.665574,0.641975,0.635514,0.0,0.414747,0.0
4,0.0288,0.038121,0.379516,0.662921,0.612048,0.632911,0.032258,0.336957,0.0
5,0.027,0.039822,0.410979,0.645161,0.580808,0.601399,0.263158,0.312849,0.0625



✅ Stage 1 Results:
  F1 Macro: 0.4110

  Per-class F1:
    vilification             : 0.6452
    extreme_language         : 0.5808
    stereotype               : 0.6014
    invalidation             : 0.2632
    lack_of_empathy          : 0.3128
    dehumanization           : 0.0625

THRESHOLD OPTIMIZATION
vilification             : threshold=0.35, F1=0.6710
extreme_language         : threshold=0.35, F1=0.6490
stereotype               : threshold=0.35, F1=0.6426
invalidation             : threshold=0.35, F1=0.3727
lack_of_empathy          : threshold=0.40, F1=0.4751
dehumanization           : threshold=0.25, F1=0.3515

############################################################
# STAGE 2: Final Model Training
############################################################

📊 Training on ALL 3380 samples


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-multilingual-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.



🚀 Training Final Model...


Step,Training Loss
50,0.0479
100,0.0394
150,0.0371
200,0.0341
250,0.0301
300,0.0293
350,0.029
400,0.0252
450,0.0248
500,0.0232



📊 Predicting on test set...



💾 Saving predictions...

✅ Saved: /content/gdrive/MyDrive/SemEval/dev_phase/subtask3/pred_arb_marbert.csv

📊 Prediction Distribution:
  vilification             :  69 (40.83%)
  extreme_language         :  61 (36.09%)
  stereotype               :  63 (37.28%)
  invalidation             :  14 ( 8.28%)
  lack_of_empathy          :  30 (17.75%)
  dehumanization           :  41 (24.26%)

  No labels: 97 (57.4%)
  Multi-label: 64 (37.9%)

✅ COMPLETE!
