In [None]:
"""
SemEval Task 9 - Subtask 3: FIXED Manifestation Classification
Target: 0.65+ F1-Macro (realistic for extreme imbalance)

CRITICAL FIXES:
✓ Fixed scipy float16 error
✓ Focal Loss instead of Asymmetric (more stable)
✓ Smaller models to prevent overfitting
✓ Label-wise class weights
✓ Fixed multi-sample dropout implementation
✓ Better threshold tuning
✓ Strong regularization
"""

import os, pandas as pd, numpy as np, torch, torch.nn as nn
import re, random, gc, warnings
from torch.utils.data import Dataset, DataLoader
from torch.cuda.amp import autocast, GradScaler
from transformers import (AutoTokenizer, AutoModel,
                         get_cosine_schedule_with_warmup, set_seed)
from sklearn.metrics import f1_score
from sklearn.model_selection import train_test_split
from tqdm.auto import tqdm
warnings.filterwarnings('ignore')

# Colab-only setup: mount Google Drive so the dataset paths under
# /content/drive (see Config.BASE_PATH) are readable.
from google.colab import drive
drive.mount('/content/drive')
# Seed the RNGs through transformers.set_seed for reproducible runs.
# NOTE(review): this duplicates the literal used for Config.SEED (42); Config
# is defined further down, so the two must be kept in sync by hand.
set_seed(42)

class Config:
    """Static configuration for the Subtask 3 multi-label pipeline.

    All settings are class attributes and are read directly from the class
    (``Config.X``); the class is never instantiated in this file.

    Note: defining the class creates ``PREDICTIONS_DIR`` and ``OUTPUT_DIR``
    on disk as a side effect.
    """

    # --- Paths -------------------------------------------------------------
    BASE_PATH = '/content/drive/MyDrive/NLP'
    TRAIN_ENG = f'{BASE_PATH}/subtask3/train/eng.csv'
    TRAIN_SWA = f'{BASE_PATH}/subtask3/train/swa.csv'
    DEV_ENG = f'{BASE_PATH}/subtask3/dev/eng.csv'
    DEV_SWA = f'{BASE_PATH}/subtask3/dev/swa.csv'
    OUTPUT_DIR = '/content/subtask3/models'       # model checkpoints
    PREDICTIONS_DIR = '/content/subtask_3'        # prediction CSVs
    os.makedirs(PREDICTIONS_DIR, exist_ok=True)
    os.makedirs(OUTPUT_DIR, exist_ok=True)

    # --- Backbones ---------------------------------------------------------
    # Smaller models to prevent overfitting
    MODEL_ENG = 'microsoft/deberta-v3-small'  # 44M params
    MODEL_SWA = 'xlm-roberta-base'  # 270M params
    MAX_LENGTH = 128  # tokenizer max_length (inputs are padded/truncated to this)

    # --- Labels ------------------------------------------------------------
    LABELS = ['stereotype', 'vilification', 'dehumanization',
              'extreme_language', 'lack_of_empathy', 'invalidation']
    # Derived from LABELS so the two can never drift apart.
    NUM_LABELS = len(LABELS)

    # --- Optimisation / regularisation --------------------------------------
    # More aggressive regularization
    BATCH_SIZE_ENG = 16
    BATCH_SIZE_SWA = 12
    GRAD_ACCUM = 2          # micro-batches accumulated per optimizer step
    EPOCHS_ENG = 12
    EPOCHS_SWA = 15
    LR_ENG = 8e-6  # Lower LR
    LR_SWA = 1e-5
    WEIGHT_DECAY = 0.08  # Higher weight decay
    WARMUP_RATIO = 0.15     # fraction of scheduler steps used for warmup
    DROPOUT = 0.35  # Higher dropout
    MAX_GRAD_NORM = 0.5     # gradient clipping threshold

    # --- Focal Loss parameters ----------------------------------------------
    # alpha multiplies the positive-target terms and (1 - alpha) the negative
    # ones, so 0.25 weights positives *less* than negatives.
    FOCAL_ALPHA = 0.25  # Weight for positive class
    FOCAL_GAMMA = 2.0   # Focusing parameter

    # --- Inference -----------------------------------------------------------
    INFERENCE_SAMPLES = 5  # Multi-sample dropout

    VAL_SIZE = 0.20         # fraction of training data held out for validation
    USE_FP16 = True         # enable autocast + GradScaler
    SEED = 42
    DEVICE = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# Report which device will be used and, when a GPU is present, its name and
# total memory in GB.
print(f"🚀 Device: {Config.DEVICE}")
if torch.cuda.is_available():
    gpu_name = torch.cuda.get_device_name(0)
    gpu_memory_gb = torch.cuda.get_device_properties(0).total_memory / 1e9
    print(f"   GPU: {gpu_name}")
    print(f"   Memory: {gpu_memory_gb:.1f} GB")

# ============================================================================
# PREPROCESSING
# ============================================================================
class TextPreprocessor:
    """Light-weight text normaliser for social-media style input.

    Steps applied by :meth:`clean`, in order: URL and @mention masking,
    hashtag un-prefixing, collapsing of long character runs, expansion of a
    fixed set of negative contractions, and whitespace normalisation.
    Letter case of the surrounding text is preserved (expansions themselves
    are lowercase).
    """

    # Patterns are compiled once at class level instead of on every call.
    _URL_RE = re.compile(r'http\S+|www\.\S+')
    _MENTION_RE = re.compile(r'@\w+')
    _HASHTAG_RE = re.compile(r'#(\w+)')
    _REPEAT_RE = re.compile(r'(.)\1{3,}')   # 4+ repeats of one character
    _SPACE_RE = re.compile(r'\s+')

    def __init__(self):
        self.contractions = {
            "ain't": "is not", "aren't": "are not", "can't": "cannot",
            "won't": "will not", "don't": "do not", "didn't": "did not",
            "hasn't": "has not", "haven't": "have not", "isn't": "is not",
            "wasn't": "was not", "weren't": "were not", "shouldn't": "should not",
            "wouldn't": "would not", "couldn't": "could not",
        }
        # One alternation for all contractions. Each apostrophe position
        # accepts both the ASCII (') and the typographic (’) form, which is
        # common in text copied from phones / word processors.
        alternatives = (
            "['’]".join(re.escape(part) for part in key.split("'"))
            for key in self.contractions
        )
        self._contraction_re = re.compile(
            r'\b(?:' + '|'.join(alternatives) + r')\b', flags=re.IGNORECASE
        )

    def _expand(self, match):
        """Return the expansion for a matched contraction (case-insensitive)."""
        key = match.group(0).lower().replace('’', "'")
        return self.contractions[key]

    def clean(self, text):
        """Normalise *text* and return it; return ``"[empty]"`` if nothing is left.

        ``None`` and float NaN (how pandas represents a missing cell) are
        treated as empty instead of being stringified to "None"/"nan".
        """
        if text is None or (isinstance(text, float) and text != text):
            return "[empty]"

        text = str(text).strip()
        if not text:
            return "[empty]"

        # Preserve case for better semantic understanding
        text = self._URL_RE.sub('[url]', text)
        text = self._MENTION_RE.sub('[user]', text)
        text = self._HASHTAG_RE.sub(r'\1', text)      # "#tag" -> "tag"
        text = self._REPEAT_RE.sub(r'\1\1', text)     # "sooooo" -> "soo"

        # Expand contractions
        text = self._contraction_re.sub(self._expand, text)

        text = self._SPACE_RE.sub(' ', text).strip()
        return text if text else "[empty]"

# ============================================================================
# FOCAL LOSS (More stable than Asymmetric)
# ============================================================================
class FocalLoss(nn.Module):
    """
    Binary focal loss on raw logits, averaged over every element.

    Each element's BCE term is scaled by ``alpha_t * (1 - p_t) ** gamma``,
    where ``p_t`` is the probability assigned to the true outcome and
    ``alpha_t`` is ``alpha`` for positive targets, ``1 - alpha`` otherwise.
    """
    def __init__(self, alpha=0.25, gamma=2.0):
        super().__init__()
        self.gamma = gamma
        self.alpha = alpha

    def forward(self, logits, targets):
        # Work in float32 regardless of the incoming precision (e.g. fp16
        # logits produced under autocast).
        x = logits.float()
        y = targets.float()
        not_y = 1 - y

        per_element_bce = nn.functional.binary_cross_entropy_with_logits(
            x, y, reduction='none'
        )

        p = torch.sigmoid(x)
        # Probability the model gives to the correct outcome.
        p_t = p * y + (1 - p) * not_y
        # Class-balancing factor.
        alpha_t = self.alpha * y + (1 - self.alpha) * not_y

        modulator = alpha_t * (1 - p_t) ** self.gamma
        return (modulator * per_element_bce).mean()

# ============================================================================
# CLASS-WEIGHTED FOCAL LOSS
# ============================================================================
class WeightedFocalLoss(nn.Module):
    """Focal Loss with per-label class weights.

    Identical to the plain focal loss (BCE scaled by
    ``alpha_t * (1 - p_t) ** gamma``) except that every label column is
    additionally multiplied by its entry in ``label_weights`` before the mean
    over all elements is taken.

    Args:
        label_weights: one weight per label (sequence, NumPy array or tensor);
            it is broadcast against the last dimension of the logits.
        alpha: weight of positive-target terms; negatives get ``1 - alpha``.
        gamma: focusing exponent.
    """
    def __init__(self, label_weights, alpha=0.25, gamma=2.0):
        super().__init__()
        self.alpha = alpha
        self.gamma = gamma
        # Registered as a buffer so that ``module.to(device)`` / ``.cuda()``
        # move it along with the module. Non-persistent keeps ``state_dict()``
        # exactly as it was (empty). ``clone`` guarantees we never alias the
        # caller's array or tensor.
        weights = torch.as_tensor(label_weights, dtype=torch.float32).detach().clone()
        self.register_buffer('label_weights', weights, persistent=False)

    def forward(self, logits, targets):
        # Always compute in float32, even for fp16 logits from autocast.
        logits = logits.float()
        targets = targets.float()

        # The criterion may never have been moved explicitly; follow the
        # logits' device lazily and cache the moved copy.
        if self.label_weights.device != logits.device:
            self.label_weights = self.label_weights.to(logits.device)

        bce_loss = nn.functional.binary_cross_entropy_with_logits(
            logits, targets, reduction='none'
        )

        probs = torch.sigmoid(logits)
        # Probability assigned to the true outcome of each element.
        p_t = probs * targets + (1 - probs) * (1 - targets)

        alpha_t = self.alpha * targets + (1 - self.alpha) * (1 - targets)
        focal_weight = alpha_t * (1 - p_t) ** self.gamma

        # Apply label-wise weights
        weighted_loss = focal_weight * bce_loss * self.label_weights

        return weighted_loss.mean()

# ============================================================================
# MODEL
# ============================================================================
class MultiLabelModel(nn.Module):
    """Pretrained encoder followed by a two-layer multi-label head.

    The head maps the first-token hidden state through
    Dropout -> Linear(hidden, hidden // 2) -> GELU -> Dropout -> Linear(num_labels)
    and returns one raw logit per label (no sigmoid applied).
    """

    def __init__(self, model_name, num_labels=6, dropout=0.35):
        super().__init__()
        self.backbone = AutoModel.from_pretrained(model_name)

        width = self.backbone.config.hidden_size
        bottleneck = width // 2

        # Layer order is kept fixed so checkpoint keys stay stable.
        head_layers = [
            nn.Dropout(dropout),
            nn.Linear(width, bottleneck),
            nn.GELU(),
            nn.Dropout(dropout * 0.8),   # slightly lighter dropout before the output layer
            nn.Linear(bottleneck, num_labels),
        ]
        self.classifier = nn.Sequential(*head_layers)

    def forward(self, input_ids, attention_mask):
        """Return logits computed from the hidden state at sequence position 0."""
        encoded = self.backbone(input_ids=input_ids, attention_mask=attention_mask)
        first_token = encoded.last_hidden_state[:, 0, :]  # CLS token
        return self.classifier(first_token)

# ============================================================================
# DATASET
# ============================================================================
class MultiLabelDataset(Dataset):
    """Torch dataset yielding tokenised text plus a float multi-hot label vector.

    Args:
        texts: indexable collection of raw strings.
        labels: DataFrame holding one column per entry of ``label_names``,
            row-aligned with ``texts`` by position.
        tokenizer: callable tokenizer invoked with ``max_length``,
            ``padding='max_length'``, ``truncation=True`` and
            ``return_tensors='pt'``.
        max_len: sequence length every example is padded/truncated to.
        label_names: label columns, in output order.
    """

    def __init__(self, texts, labels, tokenizer, max_len, label_names):
        self.texts = texts
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_len = max_len
        self.label_names = label_names
        self.prep = TextPreprocessor()
        # Extract the label matrix once. Building a pandas row with
        # ``.iloc[idx][cols]`` on every __getitem__ is very slow compared with
        # indexing a NumPy array, and the values never change.
        self._label_matrix = labels[list(label_names)].to_numpy(dtype=np.float32)

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        """Return a dict with ``input_ids``, ``attention_mask`` and ``labels``."""
        text = self.prep.clean(self.texts[idx])
        label_vector = self._label_matrix[idx]

        enc = self.tokenizer(text, max_length=self.max_len, padding='max_length',
                           truncation=True, return_tensors='pt')

        return {
            # return_tensors='pt' adds a leading batch dim of 1; drop it so the
            # DataLoader can stack examples itself.
            'input_ids': enc['input_ids'].squeeze(0),
            'attention_mask': enc['attention_mask'].squeeze(0),
            'labels': torch.tensor(label_vector, dtype=torch.float32)
        }

# ============================================================================
# UTILITIES
# ============================================================================
def load_data(path, label_names):
    """Read a labelled CSV, drop rows without text, clean the text, print stats.

    Args:
        path: CSV file with a ``text`` column and one column per label.
        label_names: label column names to summarise.

    Returns:
        The DataFrame with empty/missing-text rows removed, the index reset
        and ``text`` replaced by its cleaned form.
    """
    df = pd.read_csv(path)

    # Filter on the *raw* text. The cleaner maps empty input to the
    # placeholder "[empty]", so a length check after cleaning can never
    # remove anything, and a missing cell would be stringified first.
    has_text = df['text'].notna() & (df['text'].astype(str).str.strip().str.len() > 0)
    dropped = int((~has_text).sum())
    df = df.loc[has_text].reset_index(drop=True)
    df['text'] = df['text'].apply(TextPreprocessor().clean)

    total = len(df)
    if dropped:
        print(f"  Dropped {dropped} rows with empty text")
    print(f"  Samples: {total}")
    print(f"  Label Distribution:")

    for label in label_names:
        count = int(df[label].sum())
        pct = count / total * 100 if total else 0.0   # guard against an empty file
        print(f"    {label:20s}: {count:5d} ({pct:5.2f}%)")

    label_matrix = df[label_names].values
    samples_with_labels = int((label_matrix.sum(axis=1) > 0).sum())
    labelled_pct = samples_with_labels / total * 100 if total else 0.0
    print(f"  Samples with ≥1 label: {samples_with_labels} ({labelled_pct:.1f}%)")

    return df

def compute_class_weights(df, label_names):
    """Return one mean-normalised weight per label.

    A label with ``pos`` positive rows out of ``len(df)`` gets the raw weight
    ``sqrt(neg / (pos + 1))``; a label with no positives gets 1.0. The raw
    weights are then divided by their mean so they average to 1.
    """
    n_rows = len(df)

    def _raw_weight(column):
        positives = df[column].sum()
        if positives == 0:
            return 1.0
        # Square-rooted, +1-smoothed negative/positive ratio.
        negatives = n_rows - positives
        return np.sqrt(negatives / (positives + 1))

    raw = np.array([_raw_weight(name) for name in label_names])
    return raw / raw.mean()

def find_optimal_thresholds(model, loader, device, label_names, use_fp16=False):
    """Find, per label, the decision threshold that maximises binary F1.

    The model is run once over ``loader`` in eval mode; for every label a
    grid of 161 thresholds in [0.1, 0.9] (step 0.005) is scanned and the first
    threshold reaching the best F1 is kept.

    Fallbacks:
        * fewer than 3 positives for a label -> fixed threshold 0.7;
        * best F1 < 0.1 with fewer than 20 positives -> threshold 0.65.

    Args:
        model: callable as ``model(input_ids, attention_mask)`` returning logits.
        loader: yields dicts with ``input_ids``, ``attention_mask``, ``labels``.
        device: device the inputs are moved to.
        label_names: label names, in logit column order.
        use_fp16: run the forward pass under autocast.

    Returns:
        List with one threshold per label.
    """
    print("  Finding optimal thresholds...")
    model.eval()
    all_probs = []
    all_labels = []

    with torch.no_grad():
        for batch in tqdm(loader, desc="  Collecting", leave=False):
            ids = batch['input_ids'].to(device)
            mask = batch['attention_mask'].to(device)
            labels = batch['labels']

            if use_fp16:
                with autocast():
                    logits = model(ids, mask)
            else:
                logits = model(ids, mask)

            # Always take the sigmoid in float32. Half-precision logits from
            # autocast would otherwise yield float16 probabilities, which are
            # coarser than the probabilities evaluate() later thresholds.
            probs = torch.sigmoid(logits.float())

            all_probs.append(probs.cpu().numpy())
            all_labels.append(labels.numpy())

    all_probs = np.vstack(all_probs)
    all_labels = np.vstack(all_labels)

    thresholds = []

    for i, label in enumerate(label_names):
        label_probs = all_probs[:, i]
        label_true = all_labels[:, i]

        pos_count = int(label_true.sum())

        # Too few positives for F1 to be a meaningful selection criterion.
        if pos_count < 3:
            thresh = 0.7
            thresholds.append(thresh)
            print(f"    {label:20s}: too few ({pos_count}), using t={thresh:.3f}")
            continue

        # Search threshold space
        thresh_range = np.linspace(0.1, 0.9, 161)
        best_t, best_f1 = 0.5, 0.0

        for t in thresh_range:
            preds = (label_probs >= t).astype(np.float32)
            f1 = f1_score(label_true, preds, average='binary', zero_division=0)
            # Strict '>' keeps the lowest threshold among ties.
            if f1 > best_f1:
                best_f1, best_t = f1, t

        # Conservative fallback for low F1
        if best_f1 < 0.1 and pos_count < 20:
            best_t = 0.65

        thresholds.append(best_t)
        print(f"    {label:20s}: t={best_t:.3f}, F1={best_f1:.4f} (pos={pos_count})")

    return thresholds

def train_epoch(model, loader, opt, sched, crit, device, grad_accum, scaler=None,
                max_grad_norm=None):
    """Run one training epoch with gradient accumulation.

    Args:
        model: callable as ``model(input_ids, attention_mask)`` returning logits.
        loader: yields dicts with ``input_ids``, ``attention_mask``, ``labels``.
        opt: optimizer.
        sched: LR scheduler, stepped once per optimizer step.
        crit: loss function ``crit(logits, labels)``.
        device: device the batch tensors are moved to.
        grad_accum: number of micro-batches accumulated per optimizer step.
        scaler: optional GradScaler; when given, the forward pass runs under
            autocast and the scaled-gradient path is used.
        max_grad_norm: gradient clipping norm; defaults to
            ``Config.MAX_GRAD_NORM`` when omitted.

    Returns:
        ``(mean_loss, macro_f1)`` where the F1 is computed on the training
        batches at a fixed 0.5 threshold, with the model in train mode.
    """
    if max_grad_norm is None:
        max_grad_norm = Config.MAX_GRAD_NORM
    use_amp = scaler is not None

    model.train()
    total_loss = 0
    all_preds = []
    all_labels = []
    num_batches = len(loader)

    opt.zero_grad()

    for step, batch in enumerate(tqdm(loader, desc="  Train", leave=False)):
        ids = batch['input_ids'].to(device)
        mask = batch['attention_mask'].to(device)
        labels = batch['labels'].to(device)

        # Loss is divided by grad_accum so accumulated gradients average
        # rather than sum over the micro-batches.
        if use_amp:
            with autocast():
                logits = model(ids, mask)
                loss = crit(logits, labels) / grad_accum
            scaler.scale(loss).backward()
        else:
            logits = model(ids, mask)
            loss = crit(logits, labels) / grad_accum
            loss.backward()

        # Collect predictions in fp32
        with torch.no_grad():
            probs = torch.sigmoid(logits.detach().float())
            all_preds.append((probs >= 0.5).long().cpu().numpy())
            all_labels.append(labels.cpu().numpy())

        # Step on every full accumulation group AND on the last batch, so a
        # trailing partial group is applied instead of being discarded by the
        # next epoch's zero_grad().
        if (step + 1) % grad_accum == 0 or (step + 1) == num_batches:
            if use_amp:
                # Gradients must be unscaled before clipping.
                scaler.unscale_(opt)
                torch.nn.utils.clip_grad_norm_(model.parameters(), max_grad_norm)
                scaler.step(opt)
                scaler.update()
            else:
                torch.nn.utils.clip_grad_norm_(model.parameters(), max_grad_norm)
                opt.step()
            sched.step()
            opt.zero_grad()

        # Undo the accumulation scaling for reporting.
        total_loss += loss.item() * grad_accum

    all_preds = np.vstack(all_preds)
    all_labels = np.vstack(all_labels)
    f1_macro = f1_score(all_labels, all_preds, average='macro', zero_division=0)

    return total_loss / len(loader), f1_macro

def evaluate(model, loader, crit, device, thresholds, label_names, use_fp16=False, show_report=True):
    """Score the model on ``loader`` using one decision threshold per label.

    Runs in eval mode without gradients, binarises each label column with the
    matching entry of ``thresholds`` and computes macro F1. When
    ``show_report`` is true, a per-label F1 / count table is printed.

    Returns:
        ``(mean_batch_loss, macro_f1)``.
    """
    model.eval()
    loss_sum = 0
    prob_chunks, label_chunks = [], []

    with torch.no_grad():
        for batch in tqdm(loader, desc="  Eval", leave=False):
            ids = batch['input_ids'].to(device)
            mask = batch['attention_mask'].to(device)
            labels = batch['labels'].to(device)

            if not use_fp16:
                logits = model(ids, mask)
                loss = crit(logits, labels)
                probs = torch.sigmoid(logits)
            else:
                with autocast():
                    logits = model(ids, mask)
                    loss = crit(logits, labels)
                    # Sigmoid on float32 logits even inside autocast.
                    probs = torch.sigmoid(logits.float())

            loss_sum += loss.item()
            prob_chunks.append(probs.cpu().numpy())
            label_chunks.append(labels.cpu().numpy())

    probs_np = np.vstack(prob_chunks)
    labels_np = np.vstack(label_chunks)

    # Binarise column by column with that label's own cut-off.
    preds_np = np.zeros_like(probs_np)
    for col, cutoff in enumerate(thresholds):
        preds_np[:, col] = (probs_np[:, col] >= cutoff).astype(float)

    f1_macro = f1_score(labels_np, preds_np, average='macro', zero_division=0)
    mean_loss = loss_sum / len(loader)

    if not show_report:
        return mean_loss, f1_macro

    print("\n  Per-label Performance:")
    for col, label in enumerate(label_names):
        f1 = f1_score(labels_np[:, col], preds_np[:, col], average='binary', zero_division=0)
        pred_pos = preds_np[:, col].sum()
        true_pos = labels_np[:, col].sum()
        print(f"    {label:20s}: F1={f1:.4f} (pred={int(pred_pos):4d}, true={int(true_pos):4d})")
    print(f"\n  Macro F1: {f1_macro:.4f}")

    return mean_loss, f1_macro

def train_model(train_df, lang, model_name, lr, batch_size, epochs, config):
    """Train one language-specific model and return it with tuned thresholds.

    Pipeline: stratified train/val split (stratified on "has at least one
    label"), class-weight computation, weighted focal loss, AdamW with a
    cosine schedule and warmup, per-epoch validation, per-label threshold
    tuning from epoch 4 onwards, checkpointing of the best tuned macro F1, and
    early stopping (patience 4, never before epoch 7).

    NOTE(review): thresholds are tuned and scored on the same validation
    split, so the reported validation F1 is an optimistic estimate.

    Args:
        train_df: DataFrame with ``text`` and one column per ``config.LABELS``.
        lang: name used in log lines and in the checkpoint file name.
        model_name: pretrained model identifier for tokenizer and backbone.
        lr: learning rate.
        batch_size: training batch size (validation uses twice this).
        epochs: maximum number of epochs.
        config: configuration object (``Config``).

    Returns:
        ``(model, tokenizer, best_f1, best_thresholds)``. If no checkpoint was
        written during this run, the last-epoch weights are returned with
        ``best_f1 == 0.0`` and thresholds of 0.5.
    """
    print(f"\n{'='*70}")
    print(f"TRAINING: {lang.upper()}")
    print(f"{'='*70}")

    # Split
    has_label = (train_df[config.LABELS].sum(axis=1) > 0).astype(int)
    train_data, val_data = train_test_split(
        train_df, test_size=config.VAL_SIZE, random_state=config.SEED,
        stratify=has_label
    )

    print(f"\n  Train: {len(train_data)}, Val: {len(val_data)}")

    # Compute class weights
    class_weights = compute_class_weights(train_data, config.LABELS)
    print(f"\n  Class weights: {dict(zip(config.LABELS, class_weights))}")

    tokenizer = AutoTokenizer.from_pretrained(model_name)

    train_ds = MultiLabelDataset(train_data['text'].values, train_data, tokenizer,
                                 config.MAX_LENGTH, config.LABELS)
    val_ds = MultiLabelDataset(val_data['text'].values, val_data, tokenizer,
                               config.MAX_LENGTH, config.LABELS)

    train_loader = DataLoader(train_ds, batch_size, shuffle=True,
                              num_workers=2, pin_memory=True)
    val_loader = DataLoader(val_ds, batch_size*2, shuffle=False,
                           num_workers=2, pin_memory=True)

    model = MultiLabelModel(model_name, config.NUM_LABELS, config.DROPOUT).to(config.DEVICE)

    opt = torch.optim.AdamW(model.parameters(), lr=lr,
                           weight_decay=config.WEIGHT_DECAY, eps=1e-8)

    # Scheduler is stepped once per optimizer step, i.e. once per accumulated
    # group of GRAD_ACCUM micro-batches.
    steps = len(train_loader) * epochs // config.GRAD_ACCUM
    warmup = int(steps * config.WARMUP_RATIO)
    sched = get_cosine_schedule_with_warmup(opt, warmup, steps)

    # Use Weighted Focal Loss
    crit = WeightedFocalLoss(class_weights, alpha=config.FOCAL_ALPHA, gamma=config.FOCAL_GAMMA)
    print(f"\n  Using Weighted Focal Loss (α={config.FOCAL_ALPHA}, γ={config.FOCAL_GAMMA})")

    scaler = GradScaler() if config.USE_FP16 else None

    ckpt_path = f"{config.OUTPUT_DIR}/best_{lang}.pt"
    # Tracks whether THIS run wrote the checkpoint, so a file left over from a
    # previous run is never mistaken for the current best model.
    checkpoint_saved = False

    best_f1 = 0.0
    best_thresholds = [0.5] * config.NUM_LABELS
    patience, p_cnt = 4, 0

    for ep in range(epochs):
        print(f"\n[Epoch {ep+1}/{epochs}]")

        tr_loss, tr_f1 = train_epoch(model, train_loader, opt, sched, crit,
                                     config.DEVICE, config.GRAD_ACCUM, scaler)

        val_loss, val_f1 = evaluate(model, val_loader, crit, config.DEVICE,
                                    [0.5]*config.NUM_LABELS, config.LABELS,
                                    config.USE_FP16, show_report=False)

        print(f"  Train: Loss={tr_loss:.4f}, F1={tr_f1:.4f}")
        print(f"  Val:   Loss={val_loss:.4f}, F1={val_f1:.4f} (thresh=0.5)")

        # Threshold tuning from epoch 4
        if ep >= 3:
            thresholds = find_optimal_thresholds(model, val_loader, config.DEVICE,
                                                config.LABELS, config.USE_FP16)
            # The per-label report is only printed for the last two epochs.
            _, val_f1_tuned = evaluate(model, val_loader, crit, config.DEVICE, thresholds,
                                      config.LABELS, config.USE_FP16,
                                      show_report=(ep >= epochs - 2))

            print(f"  Val with tuned thresholds: F1={val_f1_tuned:.4f}")

            if val_f1_tuned > best_f1:
                best_f1, best_thresholds, p_cnt = val_f1_tuned, thresholds, 0
                # Save with only tensors (convert threshold list to tensor)
                # so the file can be read back with weights_only=True.
                torch.save({
                    'model': model.state_dict(),
                    'thresholds': torch.tensor(thresholds, dtype=torch.float32),
                    'f1': torch.tensor(val_f1_tuned, dtype=torch.float32)
                }, ckpt_path)
                checkpoint_saved = True
                print(f"  ✓ Saved (F1={best_f1:.4f})")
            else:
                p_cnt += 1
                print(f"  No improvement ({p_cnt}/{patience})")

        if ep >= 6 and p_cnt >= patience:
            print(f"  Early stopping at epoch {ep+1}")
            break

    if checkpoint_saved:
        # Load best (now with weights_only=True support)
        ckpt = torch.load(ckpt_path, weights_only=True)
        model.load_state_dict(ckpt['model'])
        best_f1 = float(ckpt['f1'].item())
        best_thresholds = ckpt['thresholds'].tolist()
    else:
        # Happens when epochs < 4 or the tuned F1 never exceeded 0. Loading
        # here would either fail or pick up a stale file from an older run.
        print("  ⚠ No checkpoint was saved during this run; "
              "returning last-epoch weights with default thresholds")

    print(f"\n{'='*70}")
    print(f"FINAL: {lang.upper()} F1-Macro={best_f1:.4f}")
    print(f"{'='*70}\n")

    return model, tokenizer, best_f1, best_thresholds

def predict_with_mcd(model, ids, mask, n_samples=5):
    """Average sigmoid probabilities over several dropout-enabled forward passes.

    The model is switched to train mode so its dropout layers are active,
    ``n_samples`` stochastic forward passes are run without gradients, and the
    mean probability per element is returned. The model's previous
    train/eval mode is restored afterwards, even if a forward pass raises.

    Args:
        model: callable as ``model(ids, mask)`` returning logits.
        ids: input id tensor.
        mask: attention mask tensor.
        n_samples: number of stochastic passes (must be at least 1).

    Returns:
        Tensor of mean probabilities with the same shape as the logits.
    """
    was_training = model.training
    model.train()  # Enable dropout
    try:
        with torch.no_grad():
            samples = [
                torch.sigmoid(model(ids, mask).float())
                for _ in range(n_samples)
            ]
    finally:
        # Do not leak train mode into the caller's subsequent evaluation.
        model.train(was_training)

    return torch.stack(samples).mean(dim=0)

def predict(model, tokenizer, test_file, out_file, thresholds, label_names, config,
            batch_size=None):
    """Predict labels for a CSV file and write them to ``out_file``.

    Reads ``test_file`` (needs ``id`` and ``text`` columns), averages
    probabilities over ``config.INFERENCE_SAMPLES`` dropout-enabled passes per
    batch, binarises each label with its threshold and writes a CSV with
    ``id`` plus one 0/1 column per label. Prediction counts are printed.

    NOTE(review): ``thresholds`` are typically tuned on deterministic
    (eval-mode) probabilities, while this function applies them to
    dropout-averaged probabilities — confirm that the mismatch is intended.

    Args:
        model: trained model, callable as ``model(input_ids, attention_mask)``.
        tokenizer: tokenizer matching the model.
        test_file: input CSV path.
        out_file: output CSV path.
        thresholds: one decision threshold per label.
        label_names: label names, in logit column order.
        config: configuration object (``Config``).
        batch_size: inference batch size; defaults to
            ``config.BATCH_SIZE_ENG * 2`` (the previous hard-coded value).
    """
    print(f"\nPredicting: {test_file}")

    if batch_size is None:
        batch_size = config.BATCH_SIZE_ENG*2

    df = pd.read_csv(test_file)
    df['text'] = df['text'].apply(TextPreprocessor().clean)

    # The dataset class expects label columns; fill placeholders that are
    # never read back.
    for label in label_names:
        df[label] = 0

    ds = MultiLabelDataset(df['text'].values, df, tokenizer, config.MAX_LENGTH, label_names)
    loader = DataLoader(ds, batch_size, shuffle=False,
                       num_workers=2, pin_memory=True)

    model.eval()
    all_probs = []

    print(f"  Using multi-sample dropout ({config.INFERENCE_SAMPLES} samples)...")

    for batch in tqdm(loader, desc="  Predict", leave=False):
        ids = batch['input_ids'].to(config.DEVICE)
        mask = batch['attention_mask'].to(config.DEVICE)

        mean_probs = predict_with_mcd(model, ids, mask, n_samples=config.INFERENCE_SAMPLES)
        all_probs.append(mean_probs.cpu().numpy())

    # The sampling helper may have switched the model to train mode; hand it
    # back to the caller in eval mode.
    model.eval()

    all_probs = np.vstack(all_probs)

    # Apply thresholds
    predictions = np.zeros_like(all_probs, dtype=int)
    for i, thresh in enumerate(thresholds):
        predictions[:, i] = (all_probs[:, i] >= thresh).astype(int)

    out_df = pd.DataFrame({'id': df['id']})
    for i, label in enumerate(label_names):
        out_df[label] = predictions[:, i]

    out_df.to_csv(out_file, index=False)

    print(f"✓ Saved: {out_file}")
    print(f"  Prediction Statistics:")
    for i, label in enumerate(label_names):
        count = predictions[:, i].sum()
        pct = count / len(predictions) * 100
        print(f"    {label:20s}: {count:5d} ({pct:5.2f}%) [t={thresholds[i]:.3f}]")

# Script entry point: train and predict for English, then Kiswahili, print a
# summary of validation scores and bundle both prediction files into a zip.
if __name__ == "__main__":
    print("\n" + "="*70)
    print("SemEval Task 9 - Subtask 3: FIXED")
    print("="*70)

    # ENGLISH
    print("\n📊 Loading English data...")
    eng_train = load_data(Config.TRAIN_ENG, Config.LABELS)

    eng_model, eng_tok, eng_f1, eng_t = train_model(
        eng_train, 'english', Config.MODEL_ENG, Config.LR_ENG,
        Config.BATCH_SIZE_ENG, Config.EPOCHS_ENG, Config
    )

    # Predictions are made on the dev file with the thresholds tuned above.
    predict(eng_model, eng_tok, Config.DEV_ENG,
           f"{Config.PREDICTIONS_DIR}/pred_eng.csv", eng_t, Config.LABELS, Config)

    # Release the English model before the second model is created.
    del eng_model
    gc.collect()
    torch.cuda.empty_cache()

    # KISWAHILI
    print("\n📊 Loading Kiswahili data...")
    swa_train = load_data(Config.TRAIN_SWA, Config.LABELS)

    swa_model, swa_tok, swa_f1, swa_t = train_model(
        swa_train, 'kiswahili', Config.MODEL_SWA, Config.LR_SWA,
        Config.BATCH_SIZE_SWA, Config.EPOCHS_SWA, Config
    )

    predict(swa_model, swa_tok, Config.DEV_SWA,
           f"{Config.PREDICTIONS_DIR}/pred_swa.csv", swa_t, Config.LABELS, Config)

    # Summary of the best validation macro-F1 per language and their mean.
    print("\n" + "="*70)
    print("FINAL RESULTS")
    print("="*70)
    print(f"English:   Val F1={eng_f1:.4f}")
    print(f"Kiswahili: Val F1={swa_f1:.4f}")
    print(f"Average:   {(eng_f1+swa_f1)/2:.4f}")
    print("="*70)

    # Package both CSVs under a subtask_3/ folder inside the archive, which
    # is written to the current working directory.
    print("\n📦 Creating submission...")
    import zipfile
    with zipfile.ZipFile('subtask_3.zip', 'w') as zipf:
        zipf.write(f"{Config.PREDICTIONS_DIR}/pred_eng.csv", 'subtask_3/pred_eng.csv')
        zipf.write(f"{Config.PREDICTIONS_DIR}/pred_swa.csv", 'subtask_3/pred_swa.csv')
    print("✅ DONE! Download subtask_3.zip")

Mounted at /content/drive
🚀 Device: cuda
   GPU: Tesla T4
   Memory: 15.8 GB

SemEval Task 9 - Subtask 3: FIXED

📊 Loading English data...
  Samples: 3222
  Label Distribution:
    stereotype          :   487 (15.11%)
    vilification        :   858 (26.63%)
    dehumanization      :   391 (12.14%)
    extreme_language    :   770 (23.90%)
    lack_of_empathy     :   357 (11.08%)
    invalidation        :   586 (18.19%)
  Samples with ≥1 label: 1175 (36.5%)

TRAINING: ENGLISH

  Train: 2577, Val: 645

  Class weights: {'stereotype': np.float64(1.0580733937007951), 'vilification': np.float64(0.7385401013138798), 'dehumanization': np.float64(1.195514130646282), 'extreme_language': np.float64(0.797726960126388), 'lack_of_empathy': np.float64(1.2752491751204493), 'invalidation': np.float64(0.9348962390922066)}


tokenizer_config.json:   0%|          | 0.00/52.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/578 [00:00<?, ?B/s]

spm.model:   0%|          | 0.00/2.46M [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/286M [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/286M [00:00<?, ?B/s]


  Using Weighted Focal Loss (α=0.25, γ=2.0)

[Epoch 1/12]


  Train:   0%|          | 0/162 [00:00<?, ?it/s]

  Eval:   0%|          | 0/21 [00:00<?, ?it/s]

  Train: Loss=0.1070, F1=0.1998
  Val:   Loss=0.0812, F1=0.0653 (thresh=0.5)

[Epoch 2/12]


  Train:   0%|          | 0/162 [00:00<?, ?it/s]

  Eval:   0%|          | 0/21 [00:00<?, ?it/s]

Exception ignored in: <function _MultiProcessingDataLoaderIter.__del__ at 0x7ebbf2b72520>
Traceback (most recent call last):
  File "/usr/local/lib/python3.12/dist-packages/torch/utils/data/dataloader.py", line 1654, in __del__
    self._shutdown_workers()
  File "/usr/local/lib/python3.12/dist-packages/torch/utils/data/dataloader.py", line 1637, in _shutdown_workers
    if w.is_alive():
Exception ignored in:   <function _MultiProcessingDataLoaderIter.__del__ at 0x7ebbf2b72520>
 Traceback (most recent call last):
  File "/usr/local/lib/python3.12/dist-packages/torch/utils/data/dataloader.py", line 1654, in __del__
     self._shutdown_workers()  
   File "/usr/local/lib/python3.12/dist-packages/torch/utils/data/dataloader.py", line 1637, in _shutdown_workers
^^    if w.is_alive():^^
^^  ^ ^^ ^^ ^ 
  File "/usr/lib/python3.12/multiprocessing/process.py", line 160, in is_alive
     ^assert self._parent_pid == os.getpid(), 'can only test a child process'^
 ^  ^^  ^ ^ ^ ^^ ^^ 
   File "/usr

  Train: Loss=0.0534, F1=0.0535
  Val:   Loss=0.0423, F1=0.0000 (thresh=0.5)

[Epoch 3/12]


  Train:   0%|          | 0/162 [00:00<?, ?it/s]

  Eval:   0%|          | 0/21 [00:00<?, ?it/s]

  Train: Loss=0.0403, F1=0.0133
  Val:   Loss=0.0374, F1=0.0000 (thresh=0.5)

[Epoch 4/12]


  Train:   0%|          | 0/162 [00:00<?, ?it/s]

  Eval:   0%|          | 0/21 [00:00<?, ?it/s]

Exception ignored in: <function _MultiProcessingDataLoaderIter.__del__ at 0x7ebbf2b72520>
Traceback (most recent call last):
  File "/usr/local/lib/python3.12/dist-packages/torch/utils/data/dataloader.py", line 1654, in __del__
    self._shutdown_workers()
  File "/usr/local/lib/python3.12/dist-packages/torch/utils/data/dataloader.py", line 1637, in _shutdown_workers
    if w.is_alive():
       ^^^^^^^^^^^^
  File "/usr/lib/python3.12/multiprocessing/process.py", line 160, in is_alive
    assert self._parent_pid == os.getpid(), 'can only test a child process'
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
AssertionError: can only test a child process
Exception ignored in: <function _MultiProcessingDataLoaderIter.__del__ at 0x7ebbf2b72520>
Traceback (most recent call last):
  File "/usr/local/lib/python3.12/dist-packages/torch/utils/data/dataloader.py", line 1654, in __del__
    self._shutdown_workers()
  File "/usr/local/lib/python3.12/dist-packages/torch/utils/data/dataloader.py", line 16

  Train: Loss=0.0366, F1=0.0281
  Val:   Loss=0.0378, F1=0.0000 (thresh=0.5)
  Finding optimal thresholds...


  Collecting:   0%|          | 0/21 [00:00<?, ?it/s]

Exception ignored in: <function _MultiProcessingDataLoaderIter.__del__ at 0x7ebbf2b72520>
Traceback (most recent call last):
  File "/usr/local/lib/python3.12/dist-packages/torch/utils/data/dataloader.py", line 1654, in __del__
    self._shutdown_workers()
  File "/usr/local/lib/python3.12/dist-packages/torch/utils/data/dataloader.py", line 1637, in _shutdown_workers
    Exception ignored in: if w.is_alive():<function _MultiProcessingDataLoaderIter.__del__ at 0x7ebbf2b72520>

Traceback (most recent call last):
   File "/usr/local/lib/python3.12/dist-packages/torch/utils/data/dataloader.py", line 1654, in __del__
      self._shutdown_workers() 
    File "/usr/local/lib/python3.12/dist-packages/torch/utils/data/dataloader.py", line 1637, in _shutdown_workers
 ^    ^if w.is_alive():^
^ ^^  ^^^ ^ ^^  
^  File "/usr/lib/python3.12/multiprocessing/process.py", line 160, in is_alive
^    ^assert self._parent_pid == os.getpid(), 'can only test a child process'^
^ ^ ^ ^  ^^   ^^   
  File "/usr

    stereotype          : t=0.330, F1=0.4684 (pos=99)
    vilification        : t=0.310, F1=0.6326 (pos=170)
    dehumanization      : t=0.320, F1=0.4429 (pos=77)
    extreme_language    : t=0.255, F1=0.6112 (pos=157)
    lack_of_empathy     : t=0.255, F1=0.3562 (pos=77)
    invalidation        : t=0.320, F1=0.4437 (pos=109)


  Eval:   0%|          | 0/21 [00:00<?, ?it/s]

  Val with tuned thresholds: F1=0.4925
  ✓ Saved (F1=0.4925)

[Epoch 5/12]


  Train:   0%|          | 0/162 [00:00<?, ?it/s]

  Eval:   0%|          | 0/21 [00:00<?, ?it/s]

  Train: Loss=0.0356, F1=0.0369
  Val:   Loss=0.0374, F1=0.0000 (thresh=0.5)
  Finding optimal thresholds...


  Collecting:   0%|          | 0/21 [00:00<?, ?it/s]

    stereotype          : t=0.260, F1=0.4845 (pos=99)
    vilification        : t=0.335, F1=0.6539 (pos=170)
    dehumanization      : t=0.310, F1=0.4115 (pos=77)
    extreme_language    : t=0.280, F1=0.6192 (pos=157)
    lack_of_empathy     : t=0.215, F1=0.3750 (pos=77)
    invalidation        : t=0.250, F1=0.4405 (pos=109)


  Eval:   0%|          | 0/21 [00:00<?, ?it/s]

Exception ignored in: <function _MultiProcessingDataLoaderIter.__del__ at 0x7ebbf2b72520>
Traceback (most recent call last):
  File "/usr/local/lib/python3.12/dist-packages/torch/utils/data/dataloader.py", line 1654, in __del__
    self._shutdown_workers()
  File "/usr/local/lib/python3.12/dist-packages/torch/utils/data/dataloader.py", line 1637, in _shutdown_workers
    if w.is_alive():
       ^^^^^^^^^^^^
  File "/usr/lib/python3.12/multiprocessing/process.py", line 160, in is_alive
    assert self._parent_pid == os.getpid(), 'can only test a child process'
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
AssertionError: can only test a child process
Exception ignored in: <function _MultiProcessingDataLoaderIter.__del__ at 0x7ebbf2b72520>
Traceback (most recent call last):
  File "/usr/local/lib/python3.12/dist-packages/torch/utils/data/dataloader.py", line 1654, in __del__
    self._shutdown_workers()
  File "/usr/local/lib/python3.12/dist-packages/torch/utils/data/dataloader.py", line 16

  Val with tuned thresholds: F1=0.4963
  ✓ Saved (F1=0.4963)

[Epoch 6/12]


  Train:   0%|          | 0/162 [00:00<?, ?it/s]

  Eval:   0%|          | 0/21 [00:00<?, ?it/s]

  Train: Loss=0.0328, F1=0.0723
  Val:   Loss=0.0382, F1=0.0019 (thresh=0.5)
  Finding optimal thresholds...


  Collecting:   0%|          | 0/21 [00:00<?, ?it/s]

    stereotype          : t=0.330, F1=0.4854 (pos=99)
    vilification        : t=0.285, F1=0.6512 (pos=170)
    dehumanization      : t=0.280, F1=0.4151 (pos=77)
    extreme_language    : t=0.230, F1=0.6184 (pos=157)
    lack_of_empathy     : t=0.205, F1=0.3730 (pos=77)
    invalidation        : t=0.255, F1=0.4432 (pos=109)


  Eval:   0%|          | 0/21 [00:00<?, ?it/s]

  Val with tuned thresholds: F1=0.4965
  ✓ Saved (F1=0.4965)

[Epoch 7/12]


  Train:   0%|          | 0/162 [00:00<?, ?it/s]

  Eval:   0%|          | 0/21 [00:00<?, ?it/s]

  Train: Loss=0.0324, F1=0.0754
  Val:   Loss=0.0385, F1=0.0609 (thresh=0.5)
  Finding optimal thresholds...


  Collecting:   0%|          | 0/21 [00:00<?, ?it/s]

    stereotype          : t=0.235, F1=0.4805 (pos=99)
    vilification        : t=0.290, F1=0.6620 (pos=170)
    dehumanization      : t=0.310, F1=0.4138 (pos=77)
    extreme_language    : t=0.270, F1=0.6203 (pos=157)
    lack_of_empathy     : t=0.190, F1=0.3758 (pos=77)
    invalidation        : t=0.245, F1=0.4468 (pos=109)


  Eval:   0%|          | 0/21 [00:00<?, ?it/s]

  Val with tuned thresholds: F1=0.4999
  ✓ Saved (F1=0.4999)

[Epoch 8/12]


  Train:   0%|          | 0/162 [00:00<?, ?it/s]

  Eval:   0%|          | 0/21 [00:00<?, ?it/s]

  Train: Loss=0.0305, F1=0.1090
  Val:   Loss=0.0407, F1=0.0530 (thresh=0.5)
  Finding optimal thresholds...


  Collecting:   0%|          | 0/21 [00:00<?, ?it/s]

    stereotype          : t=0.325, F1=0.4840 (pos=99)
    vilification        : t=0.225, F1=0.6511 (pos=170)
    dehumanization      : t=0.315, F1=0.4138 (pos=77)
    extreme_language    : t=0.215, F1=0.6139 (pos=157)
    lack_of_empathy     : t=0.160, F1=0.3708 (pos=77)
    invalidation        : t=0.195, F1=0.4479 (pos=109)


  Eval:   0%|          | 0/21 [00:00<?, ?it/s]

  Val with tuned thresholds: F1=0.4969
  No improvement (1/4)

[Epoch 9/12]


  Train:   0%|          | 0/162 [00:00<?, ?it/s]

  Eval:   0%|          | 0/21 [00:00<?, ?it/s]

  Train: Loss=0.0297, F1=0.1216
  Val:   Loss=0.0389, F1=0.0713 (thresh=0.5)
  Finding optimal thresholds...


  Collecting:   0%|          | 0/21 [00:00<?, ?it/s]

    stereotype          : t=0.350, F1=0.4821 (pos=99)
    vilification        : t=0.275, F1=0.6544 (pos=170)
    dehumanization      : t=0.305, F1=0.4098 (pos=77)
    extreme_language    : t=0.260, F1=0.6203 (pos=157)
    lack_of_empathy     : t=0.220, F1=0.3763 (pos=77)
    invalidation        : t=0.235, F1=0.4479 (pos=109)


  Eval:   0%|          | 0/21 [00:00<?, ?it/s]

  Val with tuned thresholds: F1=0.4985
  No improvement (2/4)

[Epoch 10/12]


  Train:   0%|          | 0/162 [00:00<?, ?it/s]

  Eval:   0%|          | 0/21 [00:00<?, ?it/s]

Exception ignored in: <function _MultiProcessingDataLoaderIter.__del__ at 0x7ebbf2b72520>Exception ignored in: 
Traceback (most recent call last):
<function _MultiProcessingDataLoaderIter.__del__ at 0x7ebbf2b72520>
  File "/usr/local/lib/python3.12/dist-packages/torch/utils/data/dataloader.py", line 1654, in __del__
Traceback (most recent call last):
      File "/usr/local/lib/python3.12/dist-packages/torch/utils/data/dataloader.py", line 1654, in __del__
self._shutdown_workers()
      File "/usr/local/lib/python3.12/dist-packages/torch/utils/data/dataloader.py", line 1637, in _shutdown_workers
self._shutdown_workers()
    if w.is_alive():  File "/usr/local/lib/python3.12/dist-packages/torch/utils/data/dataloader.py", line 1637, in _shutdown_workers

       if w.is_alive(): 
     ^  ^ ^ ^ ^^^^^^^^^^^^^^^^
  File "/usr/lib/python3.12/multiprocessing/process.py", line 160, in is_alive
^    ^assert self._parent_pid == os.getpid(), 'can only test a child process'
^ ^   
   File "/usr/lib/p

  Train: Loss=0.0298, F1=0.1292
  Val:   Loss=0.0401, F1=0.0735 (thresh=0.5)
  Finding optimal thresholds...


  Collecting:   0%|          | 0/21 [00:00<?, ?it/s]

    stereotype          : t=0.335, F1=0.4933 (pos=99)
    vilification        : t=0.240, F1=0.6560 (pos=170)
    dehumanization      : t=0.280, F1=0.4171 (pos=77)
    extreme_language    : t=0.260, F1=0.6211 (pos=157)
    lack_of_empathy     : t=0.200, F1=0.3737 (pos=77)
    invalidation        : t=0.210, F1=0.4503 (pos=109)


  Eval:   0%|          | 0/21 [00:00<?, ?it/s]

  Val with tuned thresholds: F1=0.5015
  ✓ Saved (F1=0.5015)

[Epoch 11/12]


  Train:   0%|          | 0/162 [00:00<?, ?it/s]

  Eval:   0%|          | 0/21 [00:00<?, ?it/s]

  Train: Loss=0.0290, F1=0.1255
  Val:   Loss=0.0394, F1=0.0792 (thresh=0.5)
  Finding optimal thresholds...


  Collecting:   0%|          | 0/21 [00:00<?, ?it/s]

    stereotype          : t=0.350, F1=0.4825 (pos=99)
    vilification        : t=0.270, F1=0.6544 (pos=170)
    dehumanization      : t=0.295, F1=0.4163 (pos=77)
    extreme_language    : t=0.265, F1=0.6205 (pos=157)
    lack_of_empathy     : t=0.225, F1=0.3803 (pos=77)
    invalidation        : t=0.240, F1=0.4485 (pos=109)


  Eval:   0%|          | 0/21 [00:00<?, ?it/s]


  Per-label Performance:
    stereotype          : F1=0.4825 (pred= 129, true=  99)
    vilification        : F1=0.6544 (pred= 264, true= 170)
    dehumanization      : F1=0.4144 (pred= 145, true=  77)
    extreme_language    : F1=0.6205 (pred= 233, true= 157)
    lack_of_empathy     : F1=0.3803 (pred= 207, true=  77)
    invalidation        : F1=0.4485 (pred= 270, true= 109)

  Macro F1: 0.5001
  Val with tuned thresholds: F1=0.5001
  No improvement (1/4)

[Epoch 12/12]


  Train:   0%|          | 0/162 [00:00<?, ?it/s]

  Eval:   0%|          | 0/21 [00:00<?, ?it/s]

  Train: Loss=0.0289, F1=0.1368
  Val:   Loss=0.0395, F1=0.0808 (thresh=0.5)
  Finding optimal thresholds...


  Collecting:   0%|          | 0/21 [00:00<?, ?it/s]

    stereotype          : t=0.350, F1=0.4825 (pos=99)
    vilification        : t=0.270, F1=0.6544 (pos=170)
    dehumanization      : t=0.295, F1=0.4144 (pos=77)
    extreme_language    : t=0.265, F1=0.6205 (pos=157)
    lack_of_empathy     : t=0.225, F1=0.3803 (pos=77)
    invalidation        : t=0.195, F1=0.4493 (pos=109)


  Eval:   0%|          | 0/21 [00:00<?, ?it/s]


  Per-label Performance:
    stereotype          : F1=0.4825 (pred= 129, true=  99)
    vilification        : F1=0.6544 (pred= 264, true= 170)
    dehumanization      : F1=0.4144 (pred= 145, true=  77)
    extreme_language    : F1=0.6205 (pred= 233, true= 157)
    lack_of_empathy     : F1=0.3803 (pred= 207, true=  77)
    invalidation        : F1=0.4493 (pred= 305, true= 109)

  Macro F1: 0.5002
  Val with tuned thresholds: F1=0.5002
  No improvement (2/4)

FINAL: ENGLISH F1-Macro=0.5015


Predicting: /content/drive/MyDrive/NLP/subtask3/dev/eng.csv
  Using multi-sample dropout (5 samples)...


  Predict:   0%|          | 0/5 [00:00<?, ?it/s]

✓ Saved: /content/subtask_3/pred_eng.csv
  Prediction Statistics:
    stereotype          :    29 (18.12%) [t=0.335]
    vilification        :    67 (41.88%) [t=0.240]
    dehumanization      :    33 (20.62%) [t=0.280]
    extreme_language    :    62 (38.75%) [t=0.260]
    lack_of_empathy     :    60 (37.50%) [t=0.200]
    invalidation        :    72 (45.00%) [t=0.210]

📊 Loading Kiswahili data...
  Samples: 6991
  Label Distribution:
    stereotype          :  2775 (39.69%)
    vilification        :  2883 (41.24%)
    dehumanization      :   893 (12.77%)
    extreme_language    :  1673 (23.93%)
    lack_of_empathy     :  2080 (29.75%)
    invalidation        :  1637 (23.42%)
  Samples with ≥1 label: 3504 (50.1%)

TRAINING: KISWAHILI

  Train: 5592, Val: 1399

  Class weights: {'stereotype': np.float64(0.7271160028929861), 'vilification': np.float64(0.7100589937065583), 'dehumanization': np.float64(1.5298572523054663), 'extreme_language': np.float64(1.0455265837024814), 'lack_of_empath

tokenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/615 [00:00<?, ?B/s]

sentencepiece.bpe.model:   0%|          | 0.00/5.07M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/9.10M [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.12G [00:00<?, ?B/s]


  Using Weighted Focal Loss (α=0.25, γ=2.0)

[Epoch 1/15]


  Train:   0%|          | 0/466 [00:00<?, ?it/s]

  Eval:   0%|          | 0/59 [00:00<?, ?it/s]

  Train: Loss=0.0787, F1=0.2229
  Val:   Loss=0.0533, F1=0.0000 (thresh=0.5)

[Epoch 2/15]


  Train:   0%|          | 0/466 [00:00<?, ?it/s]

  Eval:   0%|          | 0/59 [00:00<?, ?it/s]

  Train: Loss=0.0565, F1=0.0049
  Val:   Loss=0.0512, F1=0.0000 (thresh=0.5)

[Epoch 3/15]


  Train:   0%|          | 0/466 [00:00<?, ?it/s]

  Eval:   0%|          | 0/59 [00:00<?, ?it/s]

  Train: Loss=0.0527, F1=0.0266
  Val:   Loss=0.0481, F1=0.0000 (thresh=0.5)

[Epoch 4/15]


  Train:   0%|          | 0/466 [00:00<?, ?it/s]

  Eval:   0%|          | 0/59 [00:00<?, ?it/s]

  Train: Loss=0.0491, F1=0.1055
  Val:   Loss=0.0462, F1=0.1791 (thresh=0.5)
  Finding optimal thresholds...


  Collecting:   0%|          | 0/59 [00:00<?, ?it/s]

    stereotype          : t=0.390, F1=0.6839 (pos=541)
    vilification        : t=0.380, F1=0.7031 (pos=585)
    dehumanization      : t=0.265, F1=0.3015 (pos=163)
    extreme_language    : t=0.295, F1=0.4637 (pos=312)
    lack_of_empathy     : t=0.305, F1=0.5784 (pos=420)
    invalidation        : t=0.360, F1=0.5226 (pos=332)


  Eval:   0%|          | 0/59 [00:00<?, ?it/s]

  Val with tuned thresholds: F1=0.5421
  ✓ Saved (F1=0.5421)

[Epoch 5/15]


  Train:   0%|          | 0/466 [00:00<?, ?it/s]

  Eval:   0%|          | 0/59 [00:00<?, ?it/s]

  Train: Loss=0.0467, F1=0.1642
  Val:   Loss=0.0466, F1=0.1894 (thresh=0.5)
  Finding optimal thresholds...


  Collecting:   0%|          | 0/59 [00:00<?, ?it/s]

    stereotype          : t=0.330, F1=0.6736 (pos=541)
    vilification        : t=0.355, F1=0.7162 (pos=585)
    dehumanization      : t=0.255, F1=0.3063 (pos=163)
    extreme_language    : t=0.305, F1=0.4844 (pos=312)
    lack_of_empathy     : t=0.320, F1=0.6031 (pos=420)
    invalidation        : t=0.355, F1=0.5177 (pos=332)


  Eval:   0%|          | 0/59 [00:00<?, ?it/s]

  Val with tuned thresholds: F1=0.5502
  ✓ Saved (F1=0.5502)

[Epoch 6/15]


  Train:   0%|          | 0/466 [00:00<?, ?it/s]

  Eval:   0%|          | 0/59 [00:00<?, ?it/s]

  Train: Loss=0.0448, F1=0.1861
  Val:   Loss=0.0464, F1=0.2081 (thresh=0.5)
  Finding optimal thresholds...


  Collecting:   0%|          | 0/59 [00:00<?, ?it/s]

    stereotype          : t=0.375, F1=0.6785 (pos=541)
    vilification        : t=0.380, F1=0.7264 (pos=585)
    dehumanization      : t=0.305, F1=0.3092 (pos=163)
    extreme_language    : t=0.350, F1=0.4910 (pos=312)
    lack_of_empathy     : t=0.330, F1=0.6096 (pos=420)
    invalidation        : t=0.370, F1=0.5345 (pos=332)


  Eval:   0%|          | 0/59 [00:00<?, ?it/s]

  Val with tuned thresholds: F1=0.5581
  ✓ Saved (F1=0.5581)

[Epoch 7/15]


  Train:   0%|          | 0/466 [00:00<?, ?it/s]

Exception ignored in: <function _MultiProcessingDataLoaderIter.__del__ at 0x7ebbf2b72520>
Traceback (most recent call last):
  File "/usr/local/lib/python3.12/dist-packages/torch/utils/data/dataloader.py", line 1654, in __del__
    self._shutdown_workers()
  File "/usr/local/lib/python3.12/dist-packages/torch/utils/data/dataloader.py", line 1637, in _shutdown_workers
    if w.is_alive():
       ^^^^^^^^^^^^
  File "/usr/lib/python3.12/multiprocessing/process.py", line 160, in is_alive
    assert self._parent_pid == os.getpid(), 'can only test a child process'
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
AssertionError: can only test a child process
Exception ignored in: <function _MultiProcessingDataLoaderIter.__del__ at 0x7ebbf2b72520>
Traceback (most recent call last):
  File "/usr/local/lib/python3.12/dist-packages/torch/utils/data/dataloader.py", line 1654, in __del__
    self._shutdown_workers()
  File "/usr/local/lib/python3.12/dist-packages/torch/utils/data/dataloader.py", line 16

  Eval:   0%|          | 0/59 [00:00<?, ?it/s]

  Train: Loss=0.0428, F1=0.2047
  Val:   Loss=0.0473, F1=0.2209 (thresh=0.5)
  Finding optimal thresholds...


  Collecting:   0%|          | 0/59 [00:00<?, ?it/s]

    stereotype          : t=0.435, F1=0.6829 (pos=541)
    vilification        : t=0.460, F1=0.7303 (pos=585)
    dehumanization      : t=0.315, F1=0.3007 (pos=163)
    extreme_language    : t=0.385, F1=0.4843 (pos=312)
    lack_of_empathy     : t=0.365, F1=0.6011 (pos=420)
    invalidation        : t=0.385, F1=0.5370 (pos=332)


  Eval:   0%|          | 0/59 [00:00<?, ?it/s]

  Val with tuned thresholds: F1=0.5557
  No improvement (1/4)

[Epoch 8/15]


  Train:   0%|          | 0/466 [00:00<?, ?it/s]

  Eval:   0%|          | 0/59 [00:00<?, ?it/s]

  Train: Loss=0.0416, F1=0.2174
  Val:   Loss=0.0473, F1=0.2340 (thresh=0.5)
  Finding optimal thresholds...


  Collecting:   0%|          | 0/59 [00:00<?, ?it/s]

    stereotype          : t=0.370, F1=0.6770 (pos=541)
    vilification        : t=0.390, F1=0.7240 (pos=585)
    dehumanization      : t=0.290, F1=0.2943 (pos=163)
    extreme_language    : t=0.355, F1=0.4894 (pos=312)
    lack_of_empathy     : t=0.365, F1=0.5961 (pos=420)
    invalidation        : t=0.380, F1=0.5304 (pos=332)


  Eval:   0%|          | 0/59 [00:00<?, ?it/s]

  Val with tuned thresholds: F1=0.5519
  No improvement (2/4)

[Epoch 9/15]


  Train:   0%|          | 0/466 [00:00<?, ?it/s]

  Eval:   0%|          | 0/59 [00:00<?, ?it/s]

  Train: Loss=0.0396, F1=0.2558
  Val:   Loss=0.0485, F1=0.2422 (thresh=0.5)
  Finding optimal thresholds...


  Collecting:   0%|          | 0/59 [00:00<?, ?it/s]

    stereotype          : t=0.360, F1=0.6737 (pos=541)
    vilification        : t=0.420, F1=0.7235 (pos=585)
    dehumanization      : t=0.255, F1=0.2988 (pos=163)
    extreme_language    : t=0.350, F1=0.4911 (pos=312)
    lack_of_empathy     : t=0.380, F1=0.6012 (pos=420)
    invalidation        : t=0.340, F1=0.5318 (pos=332)


  Eval:   0%|          | 0/59 [00:00<?, ?it/s]

  Val with tuned thresholds: F1=0.5532
  No improvement (3/4)

[Epoch 10/15]


  Train:   0%|          | 0/466 [00:00<?, ?it/s]

  Eval:   0%|          | 0/59 [00:00<?, ?it/s]

  Train: Loss=0.0385, F1=0.2874
  Val:   Loss=0.0503, F1=0.2532 (thresh=0.5)
  Finding optimal thresholds...


  Collecting:   0%|          | 0/59 [00:00<?, ?it/s]

Exception ignored in: <function _MultiProcessingDataLoaderIter.__del__ at 0x7ebbf2b72520>
Traceback (most recent call last):
  File "/usr/local/lib/python3.12/dist-packages/torch/utils/data/dataloader.py", line 1654, in __del__
    self._shutdown_workers()
  File "/usr/local/lib/python3.12/dist-packages/torch/utils/data/dataloader.py", line 1637, in _shutdown_workers
    if w.is_alive():
       ^^^^^^^^^^^^
  File "/usr/lib/python3.12/multiprocessing/process.py", line 160, in is_alive
    assert self._parent_pid == os.getpid(), 'can only test a child process'
           ^^^^^^^^^^^^^^Exception ignored in: ^<function _MultiProcessingDataLoaderIter.__del__ at 0x7ebbf2b72520>
^Traceback (most recent call last):
^  File "/usr/local/lib/python3.12/dist-packages/torch/utils/data/dataloader.py", line 1654, in __del__
^^    ^self._shutdown_workers()
^  File "/usr/local/lib/python3.12/dist-packages/torch/utils/data/dataloader.py", line 1637, in _shutdown_workers
^    ^if w.is_alive():^
^ ^^  ^ 

    stereotype          : t=0.375, F1=0.6641 (pos=541)
    vilification        : t=0.380, F1=0.7107 (pos=585)
    dehumanization      : t=0.285, F1=0.2944 (pos=163)
    extreme_language    : t=0.305, F1=0.4715 (pos=312)
    lack_of_empathy     : t=0.380, F1=0.5869 (pos=420)
    invalidation        : t=0.360, F1=0.5089 (pos=332)


  Eval:   0%|          | 0/59 [00:00<?, ?it/s]

  Val with tuned thresholds: F1=0.5394
  No improvement (4/4)
  Early stopping at epoch 10

FINAL: KISWAHILI F1-Macro=0.5581


Predicting: /content/drive/MyDrive/NLP/subtask3/dev/swa.csv
  Using multi-sample dropout (5 samples)...


  Predict:   0%|          | 0/11 [00:00<?, ?it/s]

✓ Saved: /content/subtask_3/pred_swa.csv
  Prediction Statistics:
    stereotype          :   142 (40.69%) [t=0.375]
    vilification        :   145 (41.55%) [t=0.380]
    dehumanization      :    56 (16.05%) [t=0.305]
    extreme_language    :   100 (28.65%) [t=0.350]
    lack_of_empathy     :   147 (42.12%) [t=0.330]
    invalidation        :    84 (24.07%) [t=0.370]

FINAL RESULTS
English:   Val F1=0.5015
Kiswahili: Val F1=0.5581
Average:   0.5298

📦 Creating submission...
✅ DONE! Download subtask_3.zip
